Merge pull request #2022 from lissyx/expose-metadata

Expose extended metadata information to bindings
mozilla · Apr 24, 2019 · 9815d54 · 9815d54
2 parents 8f01cca + a9717e7
commit 9815d54
Show file tree

Hide file tree

Showing 24 changed files with 519 additions and 116 deletions.
diff --git a/native_client/args.h b/native_client/args.h
@@ -28,6 +28,8 @@ bool has_versions = false;
 
 bool extended_metadata = false;
 
+bool json_output = false;
+
 void PrintHelp(const char* bin)
 {
     std::cout <<
@@ -41,7 +43,8 @@ void PrintHelp(const char* bin)
     "	--trie TRIE		Path to the language model trie file created with native_client/generate_trie\n"
     "	--audio AUDIO		Path to the audio file to run (WAV format)\n"
     "	-t			Run in benchmark mode, output mfcc & inference time\n"
-    "	-e			Extended output, shows word timings as CSV (word, start time, duration)\n"
+    "	--extended		Output string from extended metadata\n"
+    "	--json			Extended output, shows word timings as JSON\n"
     "	--help			Show help\n"
     "	--version		Print version and exits\n";
     DS_PrintVersions();
@@ -59,7 +62,8 @@ bool ProcessArgs(int argc, char** argv)
             {"audio", required_argument, nullptr, 'w'},
             {"run_very_slowly_without_trie_I_really_know_what_Im_doing", no_argument, nullptr, 999},
             {"t", no_argument, nullptr, 't'},
-            {"e", no_argument, nullptr, 'e'},
+            {"extended", no_argument, nullptr, 'e'},
+            {"json", no_argument, nullptr, 'j'},
             {"help", no_argument, nullptr, 'h'},
             {"version", no_argument, nullptr, 'v'},
             {nullptr, no_argument, nullptr, 0}
@@ -110,6 +114,10 @@ bool ProcessArgs(int argc, char** argv)
             extended_metadata = true;
             break;
 
+        case 'j':
+            json_output = true;
+            break;
+
         case 'h': // -h or --help
         case '?': // Unrecognized option
         default:

diff --git a/native_client/client.cc b/native_client/client.cc
@@ -50,24 +50,29 @@ struct meta_word {
   float duration;
 };
 
+char* metadataToString(Metadata* metadata);
 std::vector<meta_word> WordsFromMetadata(Metadata* metadata);
 char* JSONOutput(Metadata* metadata);
 
 ds_result
 LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
-           int aSampleRate, bool extended_output)
+           int aSampleRate, bool extended_output, bool json_output)
 {
   ds_result res = {0};
 
   clock_t ds_start_time = clock();
 
   if (extended_output) {
+    Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
+    res.string = metadataToString(metadata);
+    DS_FreeMetadata(metadata);
+  } else if (json_output) {
     Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
     res.string = JSONOutput(metadata);
     DS_FreeMetadata(metadata);
   } else {
     res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize, aSampleRate);
-  }  
+  }
 
   clock_t ds_end_infer = clock();
 
@@ -241,7 +246,8 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
                                 (const short*)audio.buffer,
                                 audio.buffer_size / 2,
                                 audio.sample_rate,
-                                extended_metadata);
+                                extended_metadata,
+                                json_output);
   free(audio.buffer);
 
   if (result.string) {
@@ -255,6 +261,17 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
   }
 }
 
+// Builds the plain transcript by concatenating the `character` field of every
+// item in `metadata`, in order.
+//
+// Returns a C string allocated with strdup(); ownership passes to the caller.
+// Returns nullptr when `metadata` is null, so a failed inference is reported
+// the same way as a null string instead of dereferencing a null pointer.
+char*
+metadataToString(Metadata* metadata)
+{
+  if (metadata == nullptr) {
+    return nullptr;
+  }
+
+  std::string retval;
+  for (int i = 0; i < metadata->num_items; i++) {
+    // Bind by reference: the item is only read, no need to copy the struct.
+    const MetadataItem& item = metadata->items[i];
+    retval += item.character;
+  }
+  return strdup(retval.c_str());
+}
+
 std::vector<meta_word>
 WordsFromMetadata(Metadata* metadata)
 {
@@ -274,16 +291,16 @@ WordsFromMetadata(Metadata* metadata)
     }
 
     // Word boundary is either a space or the last character in the array
-    if (strcmp(item.character, " ") == 0 
-        || strcmp(item.character, u8" ") == 0 
+    if (strcmp(item.character, " ") == 0
+        || strcmp(item.character, u8" ") == 0
         || i == metadata->num_items-1) {
-        
+
       float word_duration = item.start_time - word_start_time;
-      
+
       if (word_duration < 0) {
         word_duration = 0;
       }
-      
+
       meta_word w;
       w.word = word;
       w.start_time = word_start_time;

diff --git a/native_client/dotnet/DeepSpeech.sln b/native_client/dotnet/DeepSpeech.sln
@@ -1,4 +1,3 @@
-
 Microsoft Visual Studio Solution File, Format Version 12.00
 # Visual Studio 15
 VisualStudioVersion = 15.0.28307.136
@@ -8,21 +7,24 @@ EndProject
 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeechConsole", "DeepSpeechConsole\DeepSpeechConsole.csproj", "{312965E5-C4F6-4D95-BA64-79906B8BC7AC}"
 EndProject
 Global
-    GlobalSection(SolutionConfigurationPlatforms) = preSolution
-        Debug|x64 = Debug|x64
-        Release|x64 = Release|x64
-    EndGlobalSection
-    GlobalSection(ProjectConfigurationPlatforms) = postSolution
-        {56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.ActiveCfg = Debug|x64
-        {56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.Build.0 = Debug|x64
-        {56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.ActiveCfg = Release|x64
-        {56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.Build.0 = Release|x64
-        {312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.ActiveCfg = Debug|x64
-        {312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.Build.0 = Debug|x64
-        {312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.ActiveCfg = Release|x64
-        {312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.Build.0 = Release|x64
-    EndGlobalSection
-    GlobalSection(SolutionProperties) = preSolution
-        HideSolutionNode = FALSE
-    EndGlobalSection
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.ActiveCfg = Debug|x64
+		{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.Build.0 = Debug|x64
+		{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.ActiveCfg = Release|x64
+		{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.Build.0 = Release|x64
+		{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.ActiveCfg = Debug|x64
+		{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.Build.0 = Debug|x64
+		{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.ActiveCfg = Release|x64
+		{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {FC035D95-DBFD-4050-885A-A2DD9134B3AD}
+	EndGlobalSection
 EndGlobal
diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs b/native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
@@ -1,9 +1,10 @@
 using DeepSpeechClient.Interfaces;
 using DeepSpeechClient.Structs;
+using DeepSpeechClient.Extensions;
+
 using System;
 using System.IO;
 using System.Runtime.InteropServices;
-using System.Text;
 
 namespace DeepSpeechClient
 {
@@ -16,7 +17,7 @@ public class DeepSpeech : IDeepSpeech
         private unsafe ModelState* _modelStateP;
         private unsafe StreamingState** _streamingStatePP;
 
-        
+
 
 
         public DeepSpeech()
@@ -119,7 +120,7 @@ public unsafe int EnableDecoderWithLM(string aAlphabetConfigPath,
         /// <summary>
         /// Feeds audio samples to an ongoing streaming inference.
         /// </summary>
-        /// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param> 
+        /// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
         public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize)
         {
             NativeImp.DS_FeedAudioContent(_streamingStatePP, aBuffer, aBufferSize);
@@ -131,11 +132,20 @@ public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize)
         /// <returns>The STT result. The user is responsible for freeing the string.</returns>
         public unsafe string FinishStream()
         {
-            return NativeImp.DS_FinishStream(_streamingStatePP);
+            return NativeImp.DS_FinishStream(_streamingStatePP).PtrToString();
+        }
+
+        /// <summary>
+        /// Closes the ongoing streaming inference, returns the STT result over the whole audio signal.
+        /// </summary>
+        /// <returns>The extended metadata as a managed object. The native struct is released by
+        /// <c>PtrToMetadata</c> during the conversion, so the caller has nothing to free.</returns>
+        public unsafe Models.Metadata FinishStreamWithMetadata()
+        {
+            return NativeImp.DS_FinishStreamWithMetadata(_streamingStatePP).PtrToMetadata();
         }
 
         /// <summary>
-        /// Computes the intermediate decoding of an ongoing streaming inference. This is an expensive process as the decoder implementation isn't 
+        /// Computes the intermediate decoding of an ongoing streaming inference. This is an expensive process as the decoder implementation isn't
         /// currently capable of streaming, so it always starts from the beginning of the audio.
         /// </summary>
         /// <returns>The STT intermediate result. The user is responsible for freeing the string.</returns>
@@ -156,7 +166,7 @@ public unsafe void PrintVersions()
         /// Creates a new streaming inference state.
         /// </summary>
         /// <param name="aPreAllocFrames">Number of timestep frames to reserve.
-        /// One timestep is equivalent to two window lengths(20ms). 
+        /// One timestep is equivalent to two window lengths(20ms).
         /// If set to 0 we reserve enough frames for 3 seconds of audio(150).</param>
         /// <param name="aSampleRate">The sample-rate of the audio signal</param>
         /// <returns>Zero for success, non-zero on failure</returns>
@@ -166,7 +176,7 @@ public unsafe int SetupStream(uint aPreAllocFrames, uint aSampleRate)
         }
 
         /// <summary>
-        /// Destroy a streaming state without decoding the computed logits. 
+        /// Destroy a streaming state without decoding the computed logits.
         /// This can be used if you no longer need the result of an ongoing streaming
         /// inference and don't want to perform a costly decode operation.
         /// </summary>
@@ -175,6 +185,22 @@ public unsafe void DiscardStream()
             NativeImp.DS_DiscardStream(ref _streamingStatePP);
         }
 
+        /// <summary>
+        /// Frees a string allocated by the native DeepSpeech library by passing it to <c>DS_FreeString</c>.
+        /// </summary>
+        /// <param name="intPtr">Native pointer to the string to release.</param>
+        /// <remarks>
+        /// <c>FinishStream</c> and <c>SpeechToText</c> convert their native result with <c>PtrToString()</c>,
+        /// which releases the native pointer by default, so their results need no call to this method.
+        /// </remarks>
+        public unsafe void FreeString(IntPtr intPtr)
+        {
+            NativeImp.DS_FreeString(intPtr);
+        }
+
+        /// <summary>
+        /// Frees a Metadata struct allocated by the native DeepSpeech library by passing it to <c>DS_FreeMetadata</c>.
+        /// </summary>
+        /// <param name="intPtr">Native pointer to the Metadata struct to release.</param>
+        /// <remarks>
+        /// <c>FinishStreamWithMetadata</c> and <c>SpeechToTextWithMetadata</c> convert their native result with
+        /// <c>PtrToMetadata()</c>, which already releases the native struct, so their results need no call to this method.
+        /// </remarks>
+        public unsafe void FreeMetadata(IntPtr intPtr)
+        {
+            NativeImp.DS_FreeMetadata(intPtr);
+        }
+
         /// <summary>
         /// Use the DeepSpeech model to perform Speech-To-Text.
         /// </summary>
@@ -184,18 +210,24 @@ public unsafe void DiscardStream()
         /// <returns>The STT result. The user is responsible for freeing the string.  Returns NULL on error.</returns>
         public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize, uint aSampleRate)
         {
-            var res = NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize, aSampleRate);
-
-            int len = 0;
-            while (Marshal.ReadByte(res, len) != 0) ++len;
-            byte[] buffer = new byte[len];
-            Marshal.Copy(res, buffer, 0, buffer.Length);
-            return Encoding.UTF8.GetString(buffer);
+            return NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToString();
+        }
+
+        /// <summary>
+        /// Use the DeepSpeech model to perform Speech-To-Text and return the extended metadata.
+        /// </summary>
+        /// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
+        /// <param name="aBufferSize">The number of samples in the audio signal.</param>
+        /// <param name="aSampleRate">The sample-rate of the audio signal.</param>
+        /// <returns>The extended metadata as a managed object. The native struct is released by
+        /// <c>PtrToMetadata</c> during the conversion, so the caller has nothing to free.
+        /// NOTE(review): the native call is documented to return NULL on error; confirm how
+        /// <c>PtrToMetadata</c> handles a null pointer.</returns>
+        public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aSampleRate)
+        {
+            return NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aSampleRate).PtrToMetadata();
         }
 
         #endregion
 
 
-        
+
     }
 }
diff --git a/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj b/native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
@@ -13,25 +13,6 @@
     <FileAlignment>512</FileAlignment>
     <Deterministic>true</Deterministic>
   </PropertyGroup>
-  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
-    <DebugSymbols>true</DebugSymbols>
-    <DebugType>full</DebugType>
-    <Optimize>false</Optimize>
-    <OutputPath>bin\Debug\</OutputPath>
-    <DefineConstants>DEBUG;TRACE</DefineConstants>
-    <ErrorReport>prompt</ErrorReport>
-    <WarningLevel>4</WarningLevel>
-    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
-  </PropertyGroup>
-  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
-    <DebugType>pdbonly</DebugType>
-    <Optimize>true</Optimize>
-    <OutputPath>bin\Release\</OutputPath>
-    <DefineConstants>TRACE</DefineConstants>
-    <ErrorReport>prompt</ErrorReport>
-    <WarningLevel>4</WarningLevel>
-    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
-  </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
     <DebugSymbols>true</DebugSymbols>
     <OutputPath>bin\x64\Debug\</OutputPath>
@@ -65,10 +46,15 @@
   <ItemGroup>
     <Compile Include="DeepSpeech.cs" />
     <Compile Include="Interfaces\IDeepSpeech.cs" />
+    <Compile Include="Extensions\NativeExtensions.cs" />
+    <Compile Include="Models\Metadata.cs" />
+    <Compile Include="Models\MetadataItem.cs" />
     <Compile Include="NativeImp.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
     <Compile Include="Structs\ModelState.cs" />
     <Compile Include="Structs\StreamingState.cs" />
+    <Compile Include="Structs\Metadata.cs" />
+    <Compile Include="Structs\MetadataItem.cs" />
   </ItemGroup>
   <ItemGroup />
   <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />

diff --git a/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs b/native_client/dotnet/DeepSpeechClient/Extensions/NativeExtensions.cs
@@ -0,0 +1,60 @@
+using DeepSpeechClient.Structs;
+using System;
+using System.Runtime.InteropServices;
+using System.Text;
+
+namespace DeepSpeechClient.Extensions
+{
+    /// <summary>
+    /// Extension methods that convert pointers returned by the native DeepSpeech library
+    /// into managed objects.
+    /// </summary>
+    internal static class NativeExtensions
+    {
+        /// <summary>
+        /// Converts native pointer to UTF-8 encoded string.
+        /// </summary>
+        /// <param name="intPtr">Native pointer to a NUL-terminated byte sequence; may be <see cref="IntPtr.Zero"/>.</param>
+        /// <param name="releasePtr">Optional parameter to release the native pointer.</param>
+        /// <returns>Result string, or <c>null</c> when <paramref name="intPtr"/> is <see cref="IntPtr.Zero"/>.</returns>
+        internal static string PtrToString(this IntPtr intPtr, bool releasePtr = true)
+        {
+            // The native speech-to-text call is documented to return NULL on error;
+            // reading through a null pointer would fault, so report it as a null string.
+            if (intPtr == IntPtr.Zero)
+                return null;
+
+            // Find the terminating NUL byte to know how many bytes to copy.
+            int len = 0;
+            while (Marshal.ReadByte(intPtr, len) != 0) ++len;
+            byte[] buffer = new byte[len];
+            Marshal.Copy(intPtr, buffer, 0, buffer.Length);
+            // The bytes now live in managed memory, so the native buffer can be released.
+            if (releasePtr)
+                NativeImp.DS_FreeString(intPtr);
+            return Encoding.UTF8.GetString(buffer);
+        }
+
+        /// <summary>
+        /// Converts a pointer into managed metadata object and releases the native struct.
+        /// </summary>
+        /// <param name="intPtr">Native pointer to a Metadata struct; may be <see cref="IntPtr.Zero"/>.</param>
+        /// <returns>Metadata managed object, or <c>null</c> when <paramref name="intPtr"/> is <see cref="IntPtr.Zero"/>.</returns>
+        internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
+        {
+            // Nothing to convert and nothing to free for a null native result.
+            if (intPtr == IntPtr.Zero)
+                return null;
+
+            try
+            {
+                var metaData = Marshal.PtrToStructure<Metadata>(intPtr);
+                var managedMetaObject = new Models.Metadata
+                {
+                    Items = new Models.MetadataItem[metaData.num_items],
+                    Probability = metaData.probability
+                };
+
+                //we need to manually read each item from the native ptr using its size
+                var sizeOfMetaItem = Marshal.SizeOf(typeof(MetadataItem));
+                //walk a local cursor instead of modifying the copied struct field
+                var itemPtr = metaData.items;
+                for (int i = 0; i < metaData.num_items; i++)
+                {
+                    var tempItem = Marshal.PtrToStructure<MetadataItem>(itemPtr);
+                    managedMetaObject.Items[i] = new Models.MetadataItem
+                    {
+                        Timestep = tempItem.timestep,
+                        StartTime = tempItem.start_time,
+                        //the character buffer is released together with the native struct below
+                        Character = tempItem.character.PtrToString(releasePtr: false)
+                    };
+                    //we keep the offset on each read
+                    itemPtr += sizeOfMetaItem;
+                }
+                return managedMetaObject;
+            }
+            finally
+            {
+                //release the native struct even when marshalling throws, so it is never leaked
+                NativeImp.DS_FreeMetadata(intPtr);
+            }
+        }
+    }
+}