
Commit 8f2d05f

Merge pull request #53 from aws-solutions-library-samples/52-video-analysis-fails-for-short-form-video-5-seconds-long

bugfixes: #51, #52
aws-kens authored Apr 11, 2024
2 parents c0dc3d6 + 32261a7 commit 8f2d05f
Showing 14 changed files with 140 additions and 13 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [4.0.1] - 2024-04-11
### Bugfixes
- AWS Elemental MediaConvert does not create the frameCapture group when s3://[PROXY_BUCKET]/_settings/aioption.json is missing.
- Short-form video (5s) fails the analysis workflow.
- Rephrased the Version Compatibility input parameter in the CloudFormation template to be clearer.

### New features
- Added the top 5 most relevant tags at the scene level


## [4.0.0] - 2024-03-06
### New features
- Dynamic frame analysis workflow
2 changes: 2 additions & 0 deletions deployment/media2cloud-backend-stack.yaml
@@ -3519,6 +3519,8 @@ Resources:
- bOpenSearchServerless
- 1
- 0
ENV_DEFAULT_AI_OPTIONS: !Ref DefaultAIOptions
ENV_DEFAULT_MINCONFIDENCE: !Ref DefaultMinConfidence
ENV_AI_OPTIONS_S3KEY: !Ref AIOptionsS3Key

IngestMainStateMachine:
5 changes: 2 additions & 3 deletions deployment/media2cloud.yaml
@@ -161,7 +161,6 @@ Parameters:
VersionCompatibilityStatement:
Type: String
Description: The new Version 4 of Media2Cloud is not compatible with previous versions due to several optimization changes. These changes include the Amazon OpenSearch cluster indexes and consolidation of the generated metadata. While a migration tool is being developed to help customers migrate from previous versions to Version 4, this CloudFormation template SHOULD NOT be used to update your existing Media2Cloud V3 deployment to the latest version. Confirm that you have read and understand the version compatibility statement. If you are creating a new stack, select "Yes, I understand and proceed".
Default: No, do not proceed
AllowedValues:
- Yes, I understand and proceed
- No, do not proceed
@@ -171,7 +170,7 @@ Metadata:
ParameterGroups:
-
Label:
default: Version Compatibility
default: PLEASE READ AND SELECT AN ANSWER
Parameters:
- VersionCompatibilityStatement
-
@@ -214,7 +213,7 @@ Metadata:
- BedrockModel
ParameterLabels:
VersionCompatibilityStatement:
default: Please read the following statement
default: Version compatibility
Email:
default: Email
PriceClass:
2 changes: 1 addition & 1 deletion source/api/lib/operations/genai/claude.js
@@ -383,7 +383,7 @@ function _createCustomPrompt(options) {
const transcript = _textInput(options);
messages.push({
role: 'user',
content: `Transcript in <transcript> tag:\n<transcript>${transcript}\n</transcript>`,
content: `Transcript in <transcript> tag:\n<transcript>${transcript}\n</transcript>\n${options.prompt}`,
});

messages.push({
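The one-line change above appends `options.prompt` after the transcript block in the custom-prompt message. A minimal sketch of the resulting message content, using hypothetical values for the transcript and prompt:

```js
// Hypothetical inputs for illustration only.
const options = { prompt: 'Summarize the key moments of this episode.' };
const transcript = 'Welcome back to the show. Today we are talking about...';

// The fixed template carries both the transcript and the caller's custom prompt.
const content = `Transcript in <transcript> tag:\n<transcript>${transcript}\n</transcript>\n${options.prompt}`;

console.log(content);
// Before this change, the template ended at </transcript>, so options.prompt
// was not part of this message.
```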
2 changes: 1 addition & 1 deletion source/custom-resources/lib/versionCompatibility/index.js
@@ -18,7 +18,7 @@ exports.CheckVersionCompatibilityStatement = async (event, context) => {
return x0.responseData;
}

let consent = event.ResourceProperties.Data.VersionCompatibilityStatement;
let consent = event.ResourceProperties.Data.VersionCompatibilityStatement || '';
consent = consent.toLowerCase();

if (consent.startsWith('yes')) {
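The `|| ''` fallback above guards against the statement being absent from the resource properties. A minimal sketch of the failure mode it prevents, with a hypothetical payload:

```js
// Hypothetical event payload where the statement is missing.
const data = {};

let consent = data.VersionCompatibilityStatement;       // undefined
// consent.toLowerCase();                               // would throw: Cannot read properties of undefined

consent = (data.VersionCompatibilityStatement || '');    // the fix: default to an empty string
consent = consent.toLowerCase();

console.log(consent.startsWith('yes'));                  // false -> the 'yes' branch is simply not taken
```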
2 changes: 1 addition & 1 deletion source/layers/core-lib/lib/.version
@@ -1 +1 @@
4.0.0
4.0.1
2 changes: 1 addition & 1 deletion source/main/analysis/post-process/states/ad-break/index.js
@@ -746,7 +746,7 @@ function _bestGuessCandidates(scenes) {
}

// special case: content does not have end credits
if (contentTimestamps[1] < 0) {
if (contentTimestamps[1] < 0 && scenes.length > 0) {
contentTimestamps[1] = scenes[scenes.length - 1].timeEnd;
}

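The added `scenes.length > 0` check above matters for very short content: if scene detection returns an empty list, indexing the last element would fail. A minimal sketch with hypothetical data:

```js
// Hypothetical: no end credits detected and no scenes at all (e.g. a ~5s clip).
const contentTimestamps = [0, -1];
const scenes = [];

// Without the length check, scenes[scenes.length - 1] is scenes[-1], which is
// undefined, so reading .timeEnd would throw a TypeError.
if (contentTimestamps[1] < 0 && scenes.length > 0) {
  contentTimestamps[1] = scenes[scenes.length - 1].timeEnd;
}

console.log(contentTimestamps); // [0, -1] -> left untouched instead of crashing
```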
@@ -35,8 +35,9 @@ const BaseState = require('../shared/base');
const MODEL_REGION = process.env.ENV_BEDROCK_REGION;
const MODEL_ID = process.env.ENV_BEDROCK_MODEL_ID;
const MODEL_VERSION = process.env.ENV_BEDROCK_MODEL_VER;
const SYSTEM = 'You are a media operation engineer. Your job is to review a portion of a video content presented by a sequence of consecutive images. Each image also contains a sequence of frames presented in a 4x7 grid reading from left to right and then from top to bottom. You may also optionally be given the dialogues of the scene that helps you to understand the context. You are asked to provide the following information: a detail description to describe the scene, identify the most relevant IAB taxonomy, GARM, sentiment, and brands and logos that may appear in the scene. It is important to return the results in JSON format and also includes a confidence score from 0 to 100. Skip any explanation.';
const SYSTEM_IAB = 'You are a media operation engineer. Your job is to review a portion of a video content presented by a sequence of consecutive images. Each image also contains a sequence of frames presented in a 4x7 grid reading from left to right and then from top to bottom. You may also optionally be given the dialogues of the scene that helps you to understand the context. You are asked to identify the most relevant IAB taxonomy. It is important to return the results in JSON format and also includes a confidence score from 0 to 100. Skip any explanation.';
const TASK_ALL = 'You are asked to provide the following information: a detail description to describe the scene, identify the most relevant IAB taxonomy, GARM, sentiment, and brands and logos that may appear in the scene, and five most relevant tags from the scene.';
const TASK_IAB = 'You are asked to identify the most relevant IAB taxonomy.';
const SYSTEM = 'You are a media operation engineer. Your job is to review a portion of a video content presented by a sequence of consecutive images. Each image also contains a sequence of frames presented in a 4x7 grid reading from left to right and then from top to bottom. You may also optionally be given the dialogues of the scene that helps you to understand the context. {{TASK}} It is important to return the results in JSON format and also includes a confidence score from 0 to 100. Skip any explanation.';
const ASSISTANT = {
ProvideDialogues: {
role: 'assistant',
@@ -62,7 +63,7 @@ const MODEL_PARAMS = {
// top_p: 0.8,
// top_k: 250,
stop_sequences: ['\n\nHuman:'],
system: SYSTEM,
// system: SYSTEM,
};

const ENABLE_IMAGE_TILE = false;
@@ -701,6 +702,12 @@ async function _inference(
text: `Here is a list of Sentiments in <sentiment> tag:\n<sentiment>\n${sentiments.join('\n')}\n</sentiment>\nOnly answer the Sentiment from this list.`,
});

// tags
additional.push({
type: 'text',
text: 'Also provide five most relevant tags of the scene.',
});

messages.push({
role: 'user',
content: additional,
@@ -736,6 +743,12 @@ async function _inference(
score: 90,
},
],
tags: [
{
text: 'BMW',
score: 90,
},
],
};

const output = `Return JSON format. An example of the output:\n${JSON.stringify(example)}\n`;
@@ -747,10 +760,12 @@
// assistant
messages.push(ASSISTANT.Prefill);

const system = SYSTEM.replace('{{TASK}}', TASK_ALL);
const modelParams = {
...MODEL_PARAMS,
...options,
messages,
system,
};

const response = await _invokeEndpoint(modelId, modelParams);
@@ -1083,7 +1098,7 @@ async function _inferenceRefineIAB(
// guardrail to only return JSON
messages.push(ASSISTANT.Prefill);

const system = SYSTEM_IAB;
const system = SYSTEM.replace('{{TASK}}', TASK_IAB);
const modelId = MODEL_ID;
const modelParams = {
...MODEL_PARAMS,
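The constants above replace two near-duplicate prompts (SYSTEM and SYSTEM_IAB) with a single SYSTEM template carrying a {{TASK}} placeholder, and `system` moves out of the shared MODEL_PARAMS so each inference call injects its own task. A minimal sketch of the pattern, with abridged prompt text and assumed shared parameter values:

```js
// Abridged prompt text and assumed shared parameters, for illustration only.
const SYSTEM = 'You are a media operation engineer. ... {{TASK}} It is important to return the results in JSON format.';
const TASK_ALL = 'Provide a scene description, the most relevant IAB taxonomy, GARM, sentiment, brands and logos, and the five most relevant tags.';
const TASK_IAB = 'Identify the most relevant IAB taxonomy.';

// `system` is no longer baked into the shared defaults.
const MODEL_PARAMS = { anthropic_version: 'bedrock-2023-05-31', max_tokens: 4096 };

function buildParams(task, messages) {
  return {
    ...MODEL_PARAMS,
    messages,
    system: SYSTEM.replace('{{TASK}}', task), // specialized per inference call
  };
}

// Full scene analysis (which now also asks for tags) vs. the IAB refinement pass:
const paramsAll = buildParams(TASK_ALL, [{ role: 'user', content: 'scene frames and dialogues go here' }]);
const paramsIab = buildParams(TASK_IAB, [{ role: 'user', content: 'scene frames go here' }]);
console.log(paramsAll.system === paramsIab.system); // false -- one template, two tasks
```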
@@ -116,6 +116,10 @@ class StateSelectSegmentFrames {
segments
);

if (frameSegmentation.length === 0) {
throw new AnalysisError('no frame being selected');
}

console.log(`[INFO]: StateSelectSegmentFrames.process: ${frameSegmentation.length} out of ${framesExtracted}`);

const {
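The guard added above turns an empty frame selection into an explicit failure instead of letting later states receive nothing to analyze. A minimal sketch, with a stand-in for the project's error class (assumed to come from core-lib):

```js
// Stand-in for the project's error class, for illustration only.
class AnalysisError extends Error {}

const frameSegmentation = []; // hypothetical: nothing selected for a ~5s clip

try {
  if (frameSegmentation.length === 0) {
    throw new AnalysisError('no frame being selected');
  }
} catch (e) {
  console.log(e instanceof AnalysisError, e.message); // true 'no frame being selected'
}
```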
@@ -7,7 +7,7 @@ const Jimp = require('jimp');
const TYPE_STEADY = ['ColorBars', 'BlackFrames', 'StudioLogo', 'Slate'];
const TYPE_CREDITS = ['EndCredits'];
const TYPE_OPENING = ['OpeningCredits'];
const TYPE_CONTENT = ['Content'];
const TYPE_CONTENT = ['Content', 'undefined'];
const HAMMING_DISTANCE_THRESHOLD = 0.85;
const SPLIT_INTERVAL = 2 * 60 * 1000; // 2min
const SAMPLING_INTERVAL = 3 * 1000; // 3s
@@ -48,6 +48,18 @@ function _withShotSegment(frameHashes, segments) {
}
});

// special case: potentially short form video. Fake the technicalCue.
if (technicalCues.length === 0) {
shotSegments.forEach((shotSegment) => {
technicalCues.push({
ShotSegmentRange: [shotSegment.ShotSegment.Index, shotSegment.ShotSegment.Index],
TechnicalCueSegment: {
Type: 'undefined',
},
});
});
}

let selected = [];
let shotIdx = 0;

@@ -178,7 +190,6 @@ function _selectFromShotSegment(
Math.round((send - ssta) / SAMPLING_INTERVAL),
1
);

selected = _selectByScanning(shotSegmentFrames, maxFrames);
} else {
console.log(`[INFO]: [#${shotIdx}]: ${technicalCueType}: not supported`);
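The two changes above work together for short-form clips: `TYPE_CONTENT` now also accepts the synthetic 'undefined' cue type, and when Amazon Rekognition returns shot segments but no technical cues at all, each shot is wrapped in a fake cue so frame selection still runs. A minimal sketch with hypothetical shot data:

```js
// Hypothetical: a ~5s clip where Rekognition detected shots but no TECHNICAL_CUE segments.
const shotSegments = [
  { ShotSegment: { Index: 0 } },
  { ShotSegment: { Index: 1 } },
];
const technicalCues = [];

// The fallback from the diff: fabricate one cue per shot with Type 'undefined'.
if (technicalCues.length === 0) {
  shotSegments.forEach((shotSegment) => {
    technicalCues.push({
      ShotSegmentRange: [shotSegment.ShotSegment.Index, shotSegment.ShotSegment.Index],
      TechnicalCueSegment: { Type: 'undefined' },
    });
  });
}

// Because TYPE_CONTENT now includes 'undefined', the synthetic cues are treated
// like regular content when frames are selected per shot.
const TYPE_CONTENT = ['Content', 'undefined'];
console.log(technicalCues.every((cue) => TYPE_CONTENT.includes(cue.TechnicalCueSegment.Type))); // true
```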
7 changes: 7 additions & 0 deletions source/main/ingest/main/states/create-record/index.js
@@ -3,6 +3,7 @@

const PATH = require('path');
const {
aimlGetPresets,
DB,
CommonUtils,
MimeTypeHelper,
@@ -13,6 +14,7 @@ const {
IngestError,
} = require('core-lib');

const DEFAULT_AI_OPTIONS = process.env.ENV_DEFAULT_AI_OPTIONS;
const AI_OPTIONS_S3KEY = process.env.ENV_AI_OPTIONS_S3KEY;

class StateCreateRecord {
@@ -138,6 +140,11 @@ class StateCreateRecord {
undefined);
}

// load from environment variable
if (!options) {
options = aimlGetPresets(DEFAULT_AI_OPTIONS);
}

/* auto select frameCaptureMode if not defined */
if (options
&& options[AnalysisTypes.Rekognition.CustomLabel]
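The hunk above gives ingestion a last-resort source of AI options: if nothing was loaded earlier (for example when s3://[PROXY_BUCKET]/_settings/aioption.json is missing, bug #51), the defaults passed through the new ENV_DEFAULT_AI_OPTIONS variable are used. A minimal sketch of that fallback, with a stubbed `aimlGetPresets` (the real helper comes from core-lib; its behavior here is assumed):

```js
// Stub of core-lib's aimlGetPresets, assumed to expand a preset name into an options object.
const aimlGetPresets = (name) => ({ preset: name || 'default', celeb: true, label: true });

const DEFAULT_AI_OPTIONS = process.env.ENV_DEFAULT_AI_OPTIONS; // wired in by the backend-stack change above

// Hypothetical: nothing came with the request and aioption.json is missing.
let options;

if (!options) {
  options = aimlGetPresets(DEFAULT_AI_OPTIONS);
}

console.log(options); // ingest no longer depends on aioption.json being present
```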
@@ -463,6 +463,16 @@ export default class AdBreakTab extends BaseAnalysisTab {
.addClass('lead-s b-300');
section.append(ulBrandAndLogos);

// Tags
const tags = $('<p/>')
.addClass('b-300 mr-4')
.append('Top 5 relevant tags');
section.append(tags);

const ulTags = $('<ul/>')
.addClass('lead-s b-300');
section.append(ulTags);

// Label category
const labelCategory = $('<p/>')
.addClass('b-300 mr-4')
@@ -510,6 +520,17 @@ export default class AdBreakTab extends BaseAnalysisTab {
}
});
}

// tags
if ((x.tags || []).length > 0) {
x.tags.forEach((item) => {
if (item.text) {
const li = $('<li/>')
.append(`${item.text} (${item.score}%)`);
ulTags.append(li);
}
});
}
});

contextual.forEach((x) => {
@@ -1,9 +1,29 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

import SolutionManifest from '/solution-manifest.js';
import Localization from '../../../../../../../shared/localization.js';
import BaseAnalysisTab from '../../base/baseAnalysisTab.js';

const {
FoundationModels = [],
} = SolutionManifest;

const {
name: MODEL_NAME = '',
value: MODEL_ID = '',
} = FoundationModels[0] || {};

const MODEL_PRICING = (MODEL_ID.indexOf('sonnet') > 0)
? {
InputTokens: 0.00300,
OutputTokens: 0.01500,
}
: {
InputTokens: 0.00025,
OutputTokens: 0.00125,
};

const {
Messages: {
ImageCaptionTab: TITLE,
@@ -46,6 +66,10 @@ export default class ImageCaptionTab extends BaseAnalysisTab {
output = JSON.parse(output);

const {
usage: {
inputTokens,
outputTokens,
},
description,
altText,
fileName,
@@ -92,6 +116,16 @@ export default class ImageCaptionTab extends BaseAnalysisTab {
}
}
});

// usage
const estimatedCost = ((
(inputTokens * MODEL_PRICING.InputTokens) +
(outputTokens * MODEL_PRICING.OutputTokens)
) / 1000).toFixed(4);

const p = $('<p/>')
.append(`(Total of <code>${inputTokens}</code> input tokens and <code>${outputTokens}</code> output tokens using ${MODEL_NAME}. Estimated cost is <code>$${estimatedCost}</code>.)`);
container.append(p);
}

return container;
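For reference, a worked example of the estimate added above, using hypothetical token counts and the Sonnet rates from the diff (rates are USD per 1,000 tokens, hence the division by 1,000):

```js
// Hypothetical token usage for one image caption.
const inputTokens = 1200;
const outputTokens = 300;

// Rates from the diff for a 'sonnet' model id (USD per 1K tokens).
const MODEL_PRICING = { InputTokens: 0.00300, OutputTokens: 0.01500 };

const estimatedCost = ((
  (inputTokens * MODEL_PRICING.InputTokens) +
  (outputTokens * MODEL_PRICING.OutputTokens)
) / 1000).toFixed(4);

console.log(estimatedCost); // "0.0081" -> (1200 * 0.003 + 300 * 0.015) / 1000
```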
@@ -354,6 +354,19 @@ export default class SegmentTab extends BaseRekognitionTab {
.addClass('lead-s b-300');
sectionBrandAndLogos.append(ulBrandAndLogos);

// tags
const sectionTags = $('<section/>');
sceneDescView.append(sectionTags);

desc = $('<p/>')
.addClass('b-400 mr-4')
.append('Top 5 relevant tags');
sectionTags.append(desc);

const ulTags = $('<ul/>')
.addClass('lead-s b-300');
sectionTags.append(ulTags);

item.details.forEach((x) => {
let li;
if ((x.description || {}).text) {
@@ -393,6 +406,17 @@ export default class SegmentTab extends BaseRekognitionTab {
}
});
}

// tags
if ((x.tags || []).length > 0) {
x.tags.forEach((_item) => {
if (_item.text) {
li = $('<li/>')
.append(`${_item.text} (${_item.score}%)`);
ulTags.append(li);
}
});
}
});
}

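Both UI hunks (AdBreakTab earlier and this SegmentTab change) consume the same scene-level `tags` array produced by the backend change in this commit. A minimal sketch of the expected shape and how each entry is rendered, with hypothetical values:

```js
// Hypothetical scene detail following the backend's example output shape.
const sceneDetail = {
  tags: [
    { text: 'BMW', score: 90 },
    { text: 'city driving', score: 84 },
  ],
};

// Mirrors the rendering in AdBreakTab and SegmentTab: one list item per tag,
// formatted as "<text> (<score>%)".
(sceneDetail.tags || []).forEach((item) => {
  if (item.text) {
    console.log(`${item.text} (${item.score}%)`);
  }
});
// BMW (90%)
// city driving (84%)
```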
