Commit

Merge pull request #35 from clowder-framework/34-creating-extractors-core-from-v1-to-v2

34 creating extractors core from v1 to v2
max-zilla authored Aug 18, 2023
2 parents fa110ee + 8f30910 commit 65d2189
Showing 18 changed files with 120 additions and 83 deletions.
23 changes: 16 additions & 7 deletions CHANGELOG.md
@@ -1,34 +1,43 @@
# Change Log

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).
The format is based on [Keep a Changelog](https://keepachangelog.com/)
and this project adheres to [Semantic Versioning](https://semver.org/).

## Unreleased

### Changed

# 2020-09-16
- Audio preview, image preview, image metadata, and file digest extractors to support Clowder V2.

Updated extractors to use pyclowder 2.3.1
## 2020-09-16

# 2020-07-07
- Updated extractors to use pyclowder 2.3.1

## 2020-07-07

## 1.0.1 - audio/speech2text

### Fixed

- Updated dependencies for audio

## 2.0.1 - pdf/preview
## 2.0.1 - pdf/preview

### Fixed

- Fixed PDF preview "no authorization" error by updating policy.xml.

# 2018-05-24
## 2018-05-24

### Changed

- All core extractors now use PyClowder2
- Uses larger chunk size

### Fixed

- Fixed bug in release shell script code.
- Fixed maintainer name spelling
- Use 10KB blocks when computing digest for faster computation.
31 changes: 26 additions & 5 deletions README.md
@@ -1,15 +1,36 @@
This repository will contain the core extractors that most people would like to install:
# Clowder Core Extractors

This repository contains the core extractors that most people would like to install:

- image : creates image previews, thumbnails and extracts metadata
- video : creates image previews, thumbnails and video previews
- audio : creates image previews, thumbnails and audio previews
- pdf : creates image previews, thumbnails and pdf previews
- office : creates image previews, thumbnails and pdf previews

You can build the docker containers using the command: `./docker.sh`

To test these extractors use the docker-compose file in clowder.

There is only a master branch, any changes made should be made to master. When making
changes to the code, bump the version in the extractor_info.json as well as update the
All pull requests in this repo should be created against the `main` branch, which is the default branch. When making
changes to the code, bump the version in the extractor's `extractor_info.json` and update the
CHANGELOG.md
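
As a quick check before opening a pull request, something along these lines (assuming `jq` is installed; any extractor directory works the same way) prints the version that will ship:

```shell
# Illustrative check of the version recorded in an extractor's extractor_info.json
jq -r .version audio/preview/extractor_info.json
```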

## Support for Clowder V2

When running extractors against a Clowder V2 instance, pass the environment variable `CLOWDER_VERSION=2` to the running
Python program or Docker container.
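
For example, to run an extractor directly with Python (outside Docker), the variable can be exported before starting the extractor script; the RabbitMQ URI and script name below are only illustrative and depend on your local setup:

```shell
# Illustrative local run of the audio preview extractor against Clowder V2
export CLOWDER_VERSION=2
export RABBITMQ_URI="amqp://guest:guest@localhost:5672/%2f"
python binary_extractor.py
```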

### Docker Build and Run Instructions for Clowder V2

`cd` into the directory containing the specific extractor:

```shell
docker build -t <extractor-docker-image-name> .
docker run --rm -e CLOWDER_VERSION=2 -e "RABBITMQ_URI=amqp://guest:guest@rabbitmq:5672/%2f" --network <clowder_network_name> <extractor-docker-image-name>
```

For example, for the image preview extractor, the build and run commands could be:

```shell
docker build -t clowder/extractors-image-preview .
docker run --rm -e CLOWDER_VERSION=2 -e "RABBITMQ_URI=amqp://guest:guest@rabbitmq:5672/%2f" --network clowder2-dev_clowder2 clowder/extractors-image-preview
```
1 change: 1 addition & 0 deletions audio/preview/Dockerfile
@@ -8,6 +8,7 @@ ARG GITSHA1="unknown"
ENV VERSION=${VERSION} \
BUILDNUMBER=${BUILDNUMBER} \
GITSHA1=${GITSHA1} \
CLOWDER_VERSION=1 \
RABBITMQ_QUEUE="ncsa.audio.preview" \
IMAGE_BINARY="/usr/bin/sox" \
IMAGE_TYPE="png" \
26 changes: 17 additions & 9 deletions audio/preview/binary_extractor.py
@@ -6,13 +6,14 @@
import subprocess
import tempfile

from pyclowder.extractors import Extractor
import pyclowder.files
import pyclowder.utils
from pyclowder.extractors import Extractor


class BinaryPreviewExtractor(Extractor):
"""Count the number of characters, words and lines in a text file."""
"""Generate audio preview and upload to Clowder"""

def __init__(self):
Extractor.__init__(self)

@@ -45,7 +46,7 @@ def __init__(self):
# parse command line and load default logging configuration
self.setup()

# setup logging for the exctractor
# setup logging for the extractor
logging.getLogger('pyclowder').setLevel(logging.DEBUG)
logging.getLogger('__main__').setLevel(logging.DEBUG)

@@ -61,27 +62,29 @@ def process_message(self, connector, host, secret_key, resource, parameters):
else:
args = self.args.image_thumbnail_command
self.execute_command(connector, host, secret_key, inputfile, file_id, resource, False,
self.args.image_binary, args, self.args.image_type)
self.args.image_binary, args, self.args.image_type, self.extractor_info["name"])

# create preview image
if 'image_preview' in parameters:
args = parameters['image_preview']
else:
args = self.args.image_preview_command
self.execute_command(connector, host, secret_key, inputfile, file_id, resource, True,
self.args.image_binary, args, self.args.image_type)
self.args.image_binary, args, self.args.image_type, self.extractor_info["name"])

# create extractor specifc preview
# create extractor specific preview
if 'preview' in parameters:
args = parameters['preview']
else:
args = self.args.preview_command
self.execute_command(connector, host, secret_key, inputfile, file_id, resource, True,
self.args.preview_binary, args, self.args.preview_type)
self.args.preview_binary, args, self.args.preview_type, self.extractor_info["name"])

@staticmethod
def execute_command(connector, host, key, inputfile, fileid, resource, preview, binary, commandline, ext):
def execute_command(connector, host, key, inputfile, fileid, resource, preview, binary, commandline, ext,
extractor_name):
logger = logging.getLogger(__name__)
clowder_version = int(os.getenv('CLOWDER_VERSION', '1'))

if binary is None or binary == '' or commandline is None or commandline == '' or ext is None or ext == '':
return
@@ -109,7 +112,12 @@ def execute_command(connector, host, key, inputfile, fileid, resource, preview,
if os.path.getsize(tmpfile) != 0:
# upload result
if preview:
pyclowder.files.upload_preview(connector, host, key, fileid, tmpfile, None)
if clowder_version == 2:
pyclowder.files.upload_preview(connector, host, key, fileid, tmpfile, None, "audio/" + ext,
visualization_name=extractor_name,
visualization_component_id="basic-audio-component")
else:
pyclowder.files.upload_preview(connector, host, key, fileid, tmpfile, None)
connector.status_update(pyclowder.utils.StatusMessage.processing, resource,
"Uploaded preview of type %s" % ext)
else:
10 changes: 6 additions & 4 deletions audio/preview/extractor_info.json
@@ -1,10 +1,12 @@
{
"@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
"@context": "https://clowderframework.org/contexts/extractors.jsonld",
"name": "ncsa.audio.preview",
"version": "2.1.9",
"description": "Creates thumbnail and image previews of Audio files.",
"version": "2.2.0",
"description": "Creates audio previews.",
"author": "Rob Kooper <[email protected]>",
"contributors": [],
"contributors": [
"Sandeep Puthanveetil Satheesan <[email protected]>"
],
"contexts": [],
"repository": [
{
2 changes: 1 addition & 1 deletion audio/preview/requirements.txt
@@ -1 +1 @@
pyclowder==2.7.0
pyclowder==3.0.2
1 change: 1 addition & 0 deletions digest/Dockerfile
@@ -7,6 +7,7 @@ ARG GITSHA1="unknown"
# environment variables
ENV VERSION=${VERSION} \
BUILDNUMBER=${BUILDNUMBER} \
CLOWDER_VERSION=1 \
GITSHA1=${GITSHA1} \
STREAM=requests \
RABBITMQ_QUEUE="ncsa.file.digest" \
4 changes: 3 additions & 1 deletion digest/extractor_info.json
@@ -4,7 +4,9 @@
"version": "2.2.4",
"description": "File digest extractor",
"author": "Max Burnette <[email protected]>",
"contributors": [],
"contributors": [
"Sandeep Puthanveetil Satheesan <[email protected]>"
],
"contexts": [],
"repository": [
{
37 changes: 14 additions & 23 deletions digest/ncsa.file.digest.py
@@ -3,14 +3,13 @@
import hashlib
import logging
import os
import requests
import pycurl
import certifi
import json

import certifi
import pyclowder.files
import pycurl
import requests
from pyclowder.extractors import Extractor
from pyclowder.utils import CheckMessage
import pyclowder.files


class FileDigestCalculator(Extractor):
@@ -26,7 +25,7 @@ def __init__(self):
# parse command line and load default logging configuration
self.setup()

# setup logging for the exctractor
# setup logging for the extractor
logging.getLogger('pyclowder').setLevel(logging.DEBUG)
logging.getLogger('__main__').setLevel(logging.DEBUG)

@@ -41,13 +40,13 @@ def stream_requests(self, connector, url, hashes):
# Stream file and update hashes
r = requests.get(url, stream=True, verify=connector.ssl_verify if connector else True)
for chunk in r.iter_content(chunk_size=10240):
for hash in hashes.values():
hash.update(chunk)
for hash_value in hashes.values():
hash_value.update(chunk)

def stream_pycurl(self, connector, url, hashes):
def hash_data(data):
for hash in hashes.values():
hash.update(data)
for hash_value in hashes.values():
hash_value.update(data)

c = pycurl.Curl()
if (connector and not connector.ssl_verify) or (os.getenv("SSL_IGNORE", "").lower() == "true"):
@@ -61,15 +60,16 @@ def hash_data(data):

def process_message(self, connector, host, secret_key, resource, parameters):
logger = logging.getLogger('__main__')
url = '%sapi/files/%s/blob?key=%s&tracking=false' % (host, resource['id'], secret_key)
file_id = resource['id']
url = '%sapi/files/%s/blob?key=%s&tracking=false' % (host, file_id, secret_key)

# Prepare hash objects
hashes = {}
for alg in self.hash_list:
hashes[alg] = hashlib.new(alg)

# stream data and compute hash
logger.debug("sending request for digest streaming: "+url)
logger.debug("sending request for digest streaming: " + url)
if os.getenv('STREAM', '').lower() == 'pycurl':
self.stream_pycurl(connector, url, hashes)
else:
@@ -86,17 +86,8 @@ def process_message(self, connector, host, secret_key, resource, parameters):
hash_context[alg] = "http://www.w3.org/2001/04/xmldsig-more#%s" % alg

# store results as metadata
metadata = {
"@context": ["https://clowder.ncsa.illinois.edu/contexts/metadata.jsonld", hash_context],
"dataset_id": resource['parent'].get('id', None),
"content": hash_digest,
"agent": {
"@type": "cat:extractor",
"extractor_id": host + "api/extractors/" + self.extractor_info['name']
}
}

pyclowder.files.upload_metadata(connector, host, secret_key, resource['id'], metadata)
metadata = self.get_metadata(hash_digest, 'file', file_id, host)
pyclowder.files.upload_metadata(connector, host, secret_key, file_id, metadata)


if __name__ == "__main__":
2 changes: 1 addition & 1 deletion digest/requirements.txt
@@ -1,2 +1,2 @@
pyclowder==2.7.0
pyclowder==3.0.2
pycurl==7.43.0.6
1 change: 1 addition & 0 deletions image/metadata/Dockerfile
@@ -8,6 +8,7 @@ ARG GITSHA1="unknown"
ENV VERSION=${VERSION} \
BUILDNUMBER=${BUILDNUMBER} \
GITSHA1=${GITSHA1} \
CLOWDER_VERSION=1 \
RABBITMQ_QUEUE="ncsa.image.metadata" \
IMAGE_BINARY="/usr/bin/identify"

7 changes: 4 additions & 3 deletions image/metadata/extractor_info.json
@@ -1,11 +1,12 @@
{
"@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
"@context": "http://www.w3.org/2003/12/exif/ns",
"name": "ncsa.image.metadata",
"version": "2.1.9",
"version": "2.2.0",
"description": "Extracts metadata from Image files.",
"author": "Max Burnette <[email protected]>",
"contributors": [
"Rob Kooper <[email protected]>"
"Rob Kooper <[email protected]>",
"Sandeep Puthanveetil Satheesan <[email protected]>"
],
"contexts": [],
"repository": [
25 changes: 8 additions & 17 deletions image/metadata/ncsa.image.metadata.py
@@ -5,13 +5,14 @@
import re
import subprocess

from pyclowder.extractors import Extractor
import pyclowder.files
import pyclowder.utils
from pyclowder.extractors import Extractor


class ImageMetadataExtractor(Extractor):
"""Count the number of characters, words and lines in a text file."""

def __init__(self):
Extractor.__init__(self)

@@ -35,19 +36,9 @@ def process_message(self, connector, host, secret_key, resource, parameters):
file_id = resource['id']

result = self.parse_exif(subprocess.check_output(
[self.args.image_binary, "-verbose", inputfile], stderr=subprocess.STDOUT).decode("utf-8"))

metadata = {
"@context": {
"@vocab": "http://www.w3.org/2003/12/exif/ns"
},
"file_id": file_id,
"content": result,
"agent": {
"@type": "cat:extractor",
"extractor_id": host + "/api/extractors/ncsa.image.metadata"
}
}
[self.args.image_binary, "-verbose", inputfile], stderr=subprocess.STDOUT).decode("utf-8", errors="ignore"))

metadata = self.get_metadata(result, 'file', file_id, host)
pyclowder.files.upload_metadata(connector, host, secret_key, file_id, metadata)

def fix_map(self, data):
@@ -131,8 +122,8 @@ def parse_exif(self, text):
".*Geometry$",
key,
flags=re.IGNORECASE) and re.match(
"^\d+x\d+\+\d+\+\d+$",
value):
"^\d+x\d+\+\d+\+\d+$",
value):
# Extract width and height from "_Geometry" (e.g "Pagegeometry") property if present.
# value must be in format "INTxINT+INT+INT".
# Full raw entry will still be added to final result.
@@ -199,7 +190,7 @@ def parse_exif(self, text):
print("Skipping : " + line)

# Add raw source onto the primary dictionary object and return it
#data[0]["raw"] = text
# data[0]["raw"] = text
return self.fix_map(data[0])


2 changes: 1 addition & 1 deletion image/metadata/requirements.txt
@@ -1 +1 @@
pyclowder==2.7.0
pyclowder==3.0.2
1 change: 1 addition & 0 deletions image/preview/Dockerfile
@@ -8,6 +8,7 @@ ARG GITSHA1="unknown"
ENV VERSION=${VERSION} \
BUILDNUMBER=${BUILDNUMBER} \
GITSHA1=${GITSHA1} \
CLOWDER_VERSION=1 \
RABBITMQ_QUEUE="ncsa.image.preview" \
IMAGE_BINARY="/usr/bin/convert" \
IMAGE_TYPE="png" \