Skip to content

Commit

Permalink
Merge pull request #2 from NYU-DataServices/schema-validation
Browse files Browse the repository at this point in the history
Schema validation
  • Loading branch information
mnyrop committed Dec 1, 2023
2 parents 18799c8 + 95826e0 commit 7d827fc
Show file tree
Hide file tree
Showing 4,209 changed files with 246,065 additions and 175,373 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
11 changes: 9 additions & 2 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
name: lint records
on: [push, pull_request]
on:
push:
paths-ignore:
- '**.md'
pull_request:
paths-ignore:
- '**.md'

jobs:
lint:
Expand All @@ -10,6 +16,7 @@ jobs:
uses: ruby/setup-ruby@v1
with:
bundler-cache: true
# run linters
- name: lint v1 records
run: bundle exec rake lint:v1
- name: lint aardvark records
run: bundle exec rake lint:aardvark
2 changes: 1 addition & 1 deletion Gemfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
source 'https://rubygems.org'

gem 'geo_combine'
gem 'json_schemer'
gem 'rake'
gem 'ruby-progressbar'
gem 'sdr_cli', github: 'NYULibraries/sdr-cli', branch: 'main'
70 changes: 12 additions & 58 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -11,81 +11,35 @@ GIT
GEM
remote: https://rubygems.org/
specs:
activesupport (7.1.2)
base64
bigdecimal
concurrent-ruby (~> 1.0, >= 1.0.2)
connection_pool (>= 2.2.5)
drb
i18n (>= 1.6, < 2)
minitest (>= 5.1)
mutex_m
tzinfo (~> 2.0)
addressable (2.8.5)
public_suffix (>= 2.0.2, < 6.0)
base64 (0.2.0)
bigdecimal (3.1.4)
builder (3.2.4)
concurrent-ruby (1.2.2)
connection_pool (2.4.1)
crass (1.0.6)
dotenv (2.8.1)
drb (2.2.0)
ruby2_keywords
faraday (2.7.11)
base64
faraday-net_http (>= 2.0, < 3.1)
ruby2_keywords (>= 0.0.4)
faraday-net_http (3.0.2)
faraday-net_http_persistent (2.1.0)
faraday (~> 2.5)
net-http-persistent (~> 4.0)
geo_combine (0.8.0)
activesupport
faraday-net_http_persistent (~> 2.0)
git
json-schema
nokogiri
rsolr
sanitize
thor
git (1.18.0)
addressable (~> 2.8)
rchardet (~> 1.8)
i18n (1.14.1)
concurrent-ruby (~> 1.0)
json-schema (4.1.1)
addressable (>= 2.8)
minitest (5.20.0)
mutex_m (0.2.0)
net-http-persistent (4.0.2)
connection_pool (~> 2.2)
nokogiri (1.15.4-arm64-darwin)
racc (~> 1.4)
nokogiri (1.15.4-x86_64-linux)
racc (~> 1.4)
public_suffix (5.0.3)
racc (1.7.3)
hana (1.3.7)
json_schemer (2.1.1)
hana (~> 1.3)
regexp_parser (~> 2.0)
simpleidn (~> 0.2)
rake (13.0.6)
rchardet (1.8.0)
rsolr (2.5.0)
builder (>= 2.1.2)
faraday (>= 0.9, < 3, != 2.0.0)
regexp_parser (2.8.2)
ruby-progressbar (1.13.0)
ruby2_keywords (0.0.5)
sanitize (6.1.0)
crass (~> 1.0.2)
nokogiri (>= 1.12.0)
simpleidn (0.2.1)
unf (~> 0.1.4)
thor (1.2.2)
tzinfo (2.0.6)
concurrent-ruby (~> 1.0)
unf (0.1.4)
unf_ext
unf_ext (0.0.9.1)

PLATFORMS
arm64-darwin-22
x86_64-linux

DEPENDENCIES
geo_combine
json_schemer
rake
ruby-progressbar
sdr_cli!
Expand Down
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
# NYU Libraries GeoBlacklight Metadata Repository

This repository is the most current source for geospatial metadata housed within the NYU Libraries collection, the [Spatial Data Repository](geo.nyu.edu).
[OpenGeoMetadata/edu.nyu](https://github.com/OpenGeoMetadata/edu.nyu) is the canonical, most current source for geospatial metadata housed within the NYU Libraries collection in our [Spatial Data Repository](https://geo.nyu.edu).

You might currently be looking at [OpenGeoMetadata/edu.nyu](https://github.com/OpenGeoMetadata/edu.nyu) or at NYU's internal fork [NYU-DataServices/gis-metadata-staging](https://github.com/NYU-DataServices/gis-metadata-staging) — the latter is where NYU staff works on in-process records to lint and stage in our staging instance.

[OpenGeoMetadata/edu.nyu](https://github.com/OpenGeoMetadata/edu.nyu) should never be committed to directly—it should only take pull requests from [NYU-DataServices/gis-metadata-staging](https://github.com/NYU-DataServices/gis-metadata-staging).






45 changes: 19 additions & 26 deletions Rakefile
Original file line number Diff line number Diff line change
@@ -1,33 +1,26 @@
require 'geo_combine'
require 'ruby-progressbar'
require_relative 'lib/lint'

# Rake tasks for linting geoblacklight.json metadata records:
#   lint:v1       - records under ./metadata-1.0
#   lint:aardvark - records under ./metadata-aardvark
#   lint:all      - runs both of the above
namespace :lint do
# Schema locations handed to `lint` (loaded via require_relative 'lib/lint').
AARDVARK_SCHEMA_URL = 'https://opengeometadata.org/schema/geoblacklight-schema-aardvark.json'
OGM_V1_SCHEMA_URL = 'https://opengeometadata.org/schema/geoblacklight-schema-1.0.json'

desc "lint version 1 geoblacklight.json records"
task :v1 do
# NOTE(review): this task body contains two validation passes: an inline
# GeoCombine loop followed by a call to `lint`. This looks like removed and
# added diff lines shown together; confirm which one is meant to remain.
paths = Dir.glob("./metadata-1.0/**/*/geoblacklight.json")
records_invalid = 0
records_valid = 0
invalid_paths = []
progess_bar = ProgressBar.create format: "Linting record %c/%C (%P% complete ) — %e", total: paths.length

paths.each do |path|
rec = GeoCombine::Geoblacklight.new(File.read(path))
begin
# The return value of valid? is discarded; a record is counted as invalid
# only when the call raises (bare rescue, i.e. StandardError).
rec.valid?
records_valid += 1
rescue
records_invalid += 1
invalid_paths << path
end
progess_bar.increment
end

# Raising here aborts the task, so the `lint` call below is reached only
# when no record was counted as invalid.
if records_invalid > 0
raise "Contains #{records_invalid} invalid records:\n#{invalid_paths}"
else
puts "All records passed ✅"
end
puts "\nOGM v1 ~>"
paths = Dir.glob("./metadata-1.0/**/*/geoblacklight.json")
lint paths, OGM_V1_SCHEMA_URL
end
desc "lint aardvark geoblacklight.json records"
task :aardvark do
puts "\nAARDVARK ~>"
# Picks up every .json file at least one directory below ./metadata-aardvark.
paths = Dir.glob("./metadata-aardvark/*/**/*.json")
lint paths, AARDVARK_SCHEMA_URL
end
desc "lint all records"
task :all do
# Runs the two lint tasks one after the other, v1 first.
Rake::Task['lint:v1'].execute
Rake::Task['lint:aardvark'].execute
end
end

# task :default => ["validate_v1"]
task :default => ["lint:all"]
31 changes: 31 additions & 0 deletions lib/lint.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
require 'json'
require 'json_schemer'
require 'open-uri'
require 'ruby-progressbar'

# Validates JSON record files against a JSON Schema and prints a report.
#
# The schema document is read once from +schema_url+ and every file in
# +paths+ is checked against it. A progress bar advances once per file.
# When all records pass, a single success line is printed; otherwise a
# summary line is printed, followed by each failing record's identifier
# and its error messages.
#
# @param paths [Array<String>] paths of the JSON record files to check
# @param schema_url [String] location of the JSON Schema, opened with URI.open
#
# NOTE(review): failures are only printed. Nothing is raised and the process
# exit status is not changed, so a CI job calling this will not fail on
# invalid records. Confirm that this is intended.
def lint(paths, schema_url)
  # Block form closes the handle once the schema is read. JSON.parse is used
  # instead of JSON.load because the document is fetched from a remote URL.
  schema = URI.open(schema_url) { |io| JSON.parse(io.read) }
  schemer = JSONSchemer.schema schema
  bar = ProgressBar.create format: "Linting record %c/%C (%P% complete ) — %e", total: paths.length

  invalid = paths.map do |path|
    failure = lint_failure(schemer, path)
    bar.increment
    failure
  end.compact

  if invalid.empty?
    puts "All #{paths.length} records passed ✅"
  else
    puts "#{invalid.length}/#{paths.length} records have failed schema validation:"
    invalid.each do |i|
      puts "❌ #{i['id']}"
      i['errors'].each { |e| puts "\t #{e}" }
    end
  end
end

# Checks one record file; returns nil when it passes, otherwise a Hash with
# the record's 'id' and the list of 'errors'.
def lint_failure(schemer, path)
  record = JSON.parse File.read(path)
  # Validate once: an empty error list is what "valid" means.
  errors = schemer.validate(record).map { |x| x['error'] }
  return if errors.empty?

  # Fall back to the file path so a failing record can always be located,
  # even when it has no id field or is not a JSON object.
  id = record.is_a?(Hash) ? (record['layer_id_s'] || record['id']) : nil
  { 'id' => id || path, 'errors' => errors }
rescue JSON::ParserError => e
  # A malformed file is reported as a failed record instead of aborting the run.
  { 'id' => path, 'errors' => ["invalid JSON: #{e.message}"] }
end
81 changes: 50 additions & 31 deletions metadata-aardvark/Datasets/nyu-2451-33876.json
Original file line number Diff line number Diff line change
@@ -1,32 +1,51 @@
{
"dct_description_sm": "This polygon shapefile contains estimates of the number of Jewish persons in the U.S. by county in 2010 census geographies, based upon multiple sources of data, including Jewish community studies available at the Jewish Data Bank, the Data Bank's Current Jewish Population Report series, and the American Community Survey. Joshua Comenetz used indicators like language spoken and ancestry in the absence of standard demographic data to measure Jewish populations. For more information on the methodology used to compile this data set, see the Jewish Data Bank.",
"dct_format_s": "Shapefile",
"dct_identifier_sm": "http://hdl.handle.net/2451/33876",
"dct_language_sm": "English",
"dct_accessRights_s": "Public",
"dct_subject_sm": [
"Demographics",
"Religion",
"Judaism"
],
"dct_title_s": "National Jewish Population Map by County",
"gbl_resourceClass_sm": "Datasets",
"dct_isPartOf_sm": [],
"schema_provider_s": "NYU",
"dct_references_s": "{\"http://schema.org/url\":\"http://hdl.handle.net/2451/33876\",\"http://schema.org/downloadUrl\":\"https://archive.nyu.edu/bitstream/2451/33876/2/nyu_2451_33876.zip\",\"http://www.opengis.net/def/serviceType/ogc/wfs\":\"https://maps-public.geo.nyu.edu/geoserver/sdr/wfs\",\"http://www.opengis.net/def/serviceType/ogc/wms\":\"https://maps-public.geo.nyu.edu/geoserver/sdr/wms\"}",
"dct_source_sm": [],
"dct_spatial_sm": [
"United States"
],
"dct_temporal_sm": [
"2010"
],
"gbl_mdVersion_s": "Aardvark",
"gbl_resourceType_sm": "Polygon data",
"gbl_wxsIdentifier_s": "sdr:nyu_2451_33876",
"gbl_mdModified_dt": "2020-11-25T08:00:43Z",
"id": "nyu-2451-33876",
"nyu_addl_dspace_s": "34679",
"locn_geometry": "ENVELOPE(-179.14734, 179.778465, 71.390482, 17.881242)",
"gbl_indexYear_im": 2010
{
"dct_description_sm": [
"This polygon shapefile contains estimates of the number of Jewish persons in the U.S. by county in 2010 census geographies, based upon multiple sources of data, including Jewish community studies available at the Jewish Data Bank, the Data Bank's Current Jewish Population Report series, and the American Community Survey. Joshua Comenetz used indicators like language spoken and ancestry in the absence of standard demographic data to measure Jewish populations. For more information on the methodology used to compile this data set, see the Jewish Data Bank."
],
"dct_format_s": "Shapefile",
"dct_identifier_sm": [
"http://hdl.handle.net/2451/33876"
],
"dct_language_sm": [
"English"
],
"dct_accessRights_s": "Public",
"dct_subject_sm": [
"Demographics",
"Religion",
"Judaism"
],
"dct_title_s": "National Jewish Population Map by County",
"gbl_resourceClass_sm": [
"Datasets"
],
"dct_isPartOf_sm": [

],
"schema_provider_s": "NYU",
"dct_references_s": "{\"http://schema.org/url\":\"http://hdl.handle.net/2451/33876\",\"http://schema.org/downloadUrl\":\"https://archive.nyu.edu/bitstream/2451/33876/2/nyu_2451_33876.zip\",\"http://www.opengis.net/def/serviceType/ogc/wfs\":\"https://maps-public.geo.nyu.edu/geoserver/sdr/wfs\",\"http://www.opengis.net/def/serviceType/ogc/wms\":\"https://maps-public.geo.nyu.edu/geoserver/sdr/wms\"}",
"dct_source_sm": [

],
"dct_spatial_sm": [
"United States"
],
"dct_temporal_sm": [
"2010"
],
"gbl_mdVersion_s": "Aardvark",
"gbl_resourceType_sm": [
"Polygon data"
],
"gbl_wxsIdentifier_s": "sdr:nyu_2451_33876",
"gbl_mdModified_dt": "2020-11-25T08:00:43Z",
"id": "nyu-2451-33876",
"nyu_addl_dspace_s": "34679",
"locn_geometry": "ENVELOPE(-179.14734, 179.778465, 71.390482, 17.881242)",
"gbl_indexYear_im": [
2010
],
"dct_publisher_sm": [
""
]
}
Loading

0 comments on commit 7d827fc

Please sign in to comment.