Skip to content

Commit

Permalink
Merge pull request #2022 from lissyx/expose-metadata
Browse files Browse the repository at this point in the history
Expose extended metadata information to bindings
  • Loading branch information
lissyx authored Apr 24, 2019
2 parents 8f01cca + a9717e7 commit 9815d54
Show file tree
Hide file tree
Showing 24 changed files with 519 additions and 116 deletions.
12 changes: 10 additions & 2 deletions native_client/args.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ bool has_versions = false;

bool extended_metadata = false;

bool json_output = false;

void PrintHelp(const char* bin)
{
std::cout <<
Expand All @@ -41,7 +43,8 @@ void PrintHelp(const char* bin)
" --trie TRIE Path to the language model trie file created with native_client/generate_trie\n"
" --audio AUDIO Path to the audio file to run (WAV format)\n"
" -t Run in benchmark mode, output mfcc & inference time\n"
" -e Extended output, shows word timings as CSV (word, start time, duration)\n"
" --extended Output string from extended metadata\n"
" --json Extended output, shows word timings as JSON\n"
" --help Show help\n"
" --version Print version and exits\n";
DS_PrintVersions();
Expand All @@ -59,7 +62,8 @@ bool ProcessArgs(int argc, char** argv)
{"audio", required_argument, nullptr, 'w'},
{"run_very_slowly_without_trie_I_really_know_what_Im_doing", no_argument, nullptr, 999},
{"t", no_argument, nullptr, 't'},
{"e", no_argument, nullptr, 'e'},
{"extended", no_argument, nullptr, 'e'},
{"json", no_argument, nullptr, 'j'},
{"help", no_argument, nullptr, 'h'},
{"version", no_argument, nullptr, 'v'},
{nullptr, no_argument, nullptr, 0}
Expand Down Expand Up @@ -110,6 +114,10 @@ bool ProcessArgs(int argc, char** argv)
extended_metadata = true;
break;

case 'j':
json_output = true;
break;

case 'h': // -h or --help
case '?': // Unrecognized option
default:
Expand Down
33 changes: 25 additions & 8 deletions native_client/client.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,24 +50,29 @@ struct meta_word {
float duration;
};

char* metadataToString(Metadata* metadata);
std::vector<meta_word> WordsFromMetadata(Metadata* metadata);
char* JSONOutput(Metadata* metadata);

ds_result
LocalDsSTT(ModelState* aCtx, const short* aBuffer, size_t aBufferSize,
int aSampleRate, bool extended_output)
int aSampleRate, bool extended_output, bool json_output)
{
ds_result res = {0};

clock_t ds_start_time = clock();

if (extended_output) {
Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
res.string = metadataToString(metadata);
DS_FreeMetadata(metadata);
} else if (json_output) {
Metadata *metadata = DS_SpeechToTextWithMetadata(aCtx, aBuffer, aBufferSize, aSampleRate);
res.string = JSONOutput(metadata);
DS_FreeMetadata(metadata);
} else {
res.string = DS_SpeechToText(aCtx, aBuffer, aBufferSize, aSampleRate);
}
}

clock_t ds_end_infer = clock();

Expand Down Expand Up @@ -241,7 +246,8 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
(const short*)audio.buffer,
audio.buffer_size / 2,
audio.sample_rate,
extended_metadata);
extended_metadata,
json_output);
free(audio.buffer);

if (result.string) {
Expand All @@ -255,6 +261,17 @@ ProcessFile(ModelState* context, const char* path, bool show_times)
}
}

// Builds the plain transcript from extended metadata by concatenating the
// `character` field of every item, in order.
//
// Returns a newly strdup()-allocated C string, or nullptr when `metadata` is
// null (the inference call that produces it may fail and hand back no result).
char*
metadataToString(Metadata* metadata)
{
  // Guard against a failed inference: there is nothing to walk.
  if (!metadata) {
    return nullptr;
  }

  std::string transcript;
  for (int i = 0; i < metadata->num_items; ++i) {
    const char* piece = metadata->items[i].character;
    // Appending a null char* to std::string is undefined behaviour; skip it.
    if (piece) {
      transcript += piece;
    }
  }
  return strdup(transcript.c_str());
}

std::vector<meta_word>
WordsFromMetadata(Metadata* metadata)
{
Expand All @@ -274,16 +291,16 @@ WordsFromMetadata(Metadata* metadata)
}

// Word boundary is either a space or the last character in the array
if (strcmp(item.character, " ") == 0
|| strcmp(item.character, u8" ") == 0
if (strcmp(item.character, " ") == 0
|| strcmp(item.character, u8" ") == 0
|| i == metadata->num_items-1) {

float word_duration = item.start_time - word_start_time;

if (word_duration < 0) {
word_duration = 0;
}

meta_word w;
w.word = word;
w.start_time = word_start_time;
Expand Down
38 changes: 20 additions & 18 deletions native_client/dotnet/DeepSpeech.sln
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.28307.136
Expand All @@ -8,21 +7,24 @@ EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DeepSpeechConsole", "DeepSpeechConsole\DeepSpeechConsole.csproj", "{312965E5-C4F6-4D95-BA64-79906B8BC7AC}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.ActiveCfg = Debug|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.Build.0 = Debug|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.ActiveCfg = Release|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.Build.0 = Release|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.ActiveCfg = Debug|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.Build.0 = Debug|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.ActiveCfg = Release|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.ActiveCfg = Debug|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Debug|x64.Build.0 = Debug|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.ActiveCfg = Release|x64
{56DE4091-BBBE-47E4-852D-7268B33B971F}.Release|x64.Build.0 = Release|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.ActiveCfg = Debug|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Debug|x64.Build.0 = Debug|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.ActiveCfg = Release|x64
{312965E5-C4F6-4D95-BA64-79906B8BC7AC}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {FC035D95-DBFD-4050-885A-A2DD9134B3AD}
EndGlobalSection
EndGlobal
62 changes: 47 additions & 15 deletions native_client/dotnet/DeepSpeechClient/DeepSpeech.cs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
using DeepSpeechClient.Interfaces;
using DeepSpeechClient.Structs;
using DeepSpeechClient.Extensions;

using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;

namespace DeepSpeechClient
{
Expand All @@ -16,7 +17,7 @@ public class DeepSpeech : IDeepSpeech
private unsafe ModelState* _modelStateP;
private unsafe StreamingState** _streamingStatePP;




public DeepSpeech()
Expand Down Expand Up @@ -119,7 +120,7 @@ public unsafe int EnableDecoderWithLM(string aAlphabetConfigPath,
/// <summary>
/// Feeds audio samples to an ongoing streaming inference.
/// </summary>
/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
/// <param name="aBuffer">An array of 16-bit, mono raw audio samples at the appropriate sample rate.</param>
public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize)
{
NativeImp.DS_FeedAudioContent(_streamingStatePP, aBuffer, aBufferSize);
Expand All @@ -131,11 +132,20 @@ public unsafe void FeedAudioContent(short[] aBuffer, uint aBufferSize)
/// <returns>The STT result. The user is responsible for freeing the string.</returns>
public unsafe string FinishStream()
{
    // Convert through PtrToString so the native UTF-8 buffer is copied into a
    // managed string and then released (PtrToString frees the pointer by default).
    // The previous direct return bypassed that conversion and left this path unreachable.
    var nativeText = NativeImp.DS_FinishStream(_streamingStatePP);
    return nativeText.PtrToString();
}

/// <summary>
/// Closes the ongoing streaming inference and returns the extended metadata for the whole audio signal.
/// </summary>
/// <returns>
/// A managed copy of the extended metadata. The native struct is released by
/// <c>PtrToMetadata</c> during conversion, so the caller has nothing to free.
/// </returns>
public unsafe Models.Metadata FinishStreamWithMetadata()
{
    var nativeMetadata = NativeImp.DS_FinishStreamWithMetadata(_streamingStatePP);
    return nativeMetadata.PtrToMetadata();
}

/// <summary>
/// Computes the intermediate decoding of an ongoing streaming inference. This is an expensive process as the decoder implementation isn't
/// Computes the intermediate decoding of an ongoing streaming inference. This is an expensive process as the decoder implementation isn't
/// currently capable of streaming, so it always starts from the beginning of the audio.
/// </summary>
/// <returns>The STT intermediate result. The user is responsible for freeing the string.</returns>
Expand All @@ -156,7 +166,7 @@ public unsafe void PrintVersions()
/// Creates a new streaming inference state.
/// </summary>
/// <param name="aPreAllocFrames">Number of timestep frames to reserve.
/// One timestep is equivalent to two window lengths(20ms).
/// One timestep is equivalent to two window lengths(20ms).
/// If set to 0 we reserve enough frames for 3 seconds of audio(150).</param>
/// <param name="aSampleRate">The sample-rate of the audio signal</param>
/// <returns>Zero for success, non-zero on failure</returns>
Expand All @@ -166,7 +176,7 @@ public unsafe int SetupStream(uint aPreAllocFrames, uint aSampleRate)
}

/// <summary>
/// Destroy a streaming state without decoding the computed logits.
/// Destroy a streaming state without decoding the computed logits.
/// This can be used if you no longer need the result of an ongoing streaming
/// inference and don't want to perform a costly decode operation.
/// </summary>
Expand All @@ -175,6 +185,22 @@ public unsafe void DiscardStream()
NativeImp.DS_DiscardStream(ref _streamingStatePP);
}

/// <summary>
/// Releases a string that was allocated by the native DeepSpeech library.
/// </summary>
/// <param name="intPtr">Native pointer handed to <c>DS_FreeString</c>.</param>
public unsafe void FreeString(IntPtr intPtr) => NativeImp.DS_FreeString(intPtr);

/// <summary>
/// Releases a Metadata struct that was allocated by the native DeepSpeech library.
/// </summary>
/// <param name="intPtr">Native pointer handed to <c>DS_FreeMetadata</c>.</param>
public unsafe void FreeMetadata(IntPtr intPtr) => NativeImp.DS_FreeMetadata(intPtr);

/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text.
/// </summary>
Expand All @@ -184,18 +210,24 @@ public unsafe void DiscardStream()
/// <returns>The STT result. The user is responsible for freeing the string. Returns NULL on error.</returns>
public unsafe string SpeechToText(short[] aBuffer, uint aBufferSize, uint aSampleRate)
{
    // Delegate decoding to PtrToString: it copies the NUL-terminated UTF-8 buffer
    // into a managed string and frees the native pointer (releasePtr defaults to true).
    // The earlier hand-rolled copy returned first, never freed the native string,
    // and made this conversion unreachable.
    var nativeText = NativeImp.DS_SpeechToText(_modelStatePP, aBuffer, aBufferSize, aSampleRate);
    return nativeText.PtrToString();
}

/// <summary>
/// Use the DeepSpeech model to perform Speech-To-Text and return extended metadata.
/// </summary>
/// <param name="aBuffer">A 16-bit, mono raw audio signal at the appropriate sample rate.</param>
/// <param name="aBufferSize">The number of samples in the audio signal.</param>
/// <param name="aSampleRate">The sample-rate of the audio signal.</param>
/// <returns>
/// A managed copy of the extended metadata. The native struct is released by
/// <c>PtrToMetadata</c> during conversion, so the caller has nothing to free.
/// </returns>
/// <remarks>
/// NOTE(review): the native call is described as returning NULL on error; confirm how
/// <c>PtrToMetadata</c> treats a null pointer before relying on a null result here.
/// </remarks>
public unsafe Models.Metadata SpeechToTextWithMetadata(short[] aBuffer, uint aBufferSize, uint aSampleRate)
{
    var nativeMetadata = NativeImp.DS_SpeechToTextWithMetadata(_modelStatePP, aBuffer, aBufferSize, aSampleRate);
    return nativeMetadata.PtrToMetadata();
}

#endregion



}
}
24 changes: 5 additions & 19 deletions native_client/dotnet/DeepSpeechClient/DeepSpeechClient.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -13,25 +13,6 @@
<FileAlignment>512</FileAlignment>
<Deterministic>true</Deterministic>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x64\Debug\</OutputPath>
Expand Down Expand Up @@ -65,10 +46,15 @@
<ItemGroup>
<Compile Include="DeepSpeech.cs" />
<Compile Include="Interfaces\IDeepSpeech.cs" />
<Compile Include="Extensions\NativeExtensions.cs" />
<Compile Include="Models\Metadata.cs" />
<Compile Include="Models\MetadataItem.cs" />
<Compile Include="NativeImp.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="Structs\ModelState.cs" />
<Compile Include="Structs\StreamingState.cs" />
<Compile Include="Structs\Metadata.cs" />
<Compile Include="Structs\MetadataItem.cs" />
</ItemGroup>
<ItemGroup />
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
using DeepSpeechClient.Structs;
using System;
using System.Runtime.InteropServices;
using System.Text;

namespace DeepSpeechClient.Extensions
{
    /// <summary>
    /// Helpers that convert pointers returned by the native DeepSpeech library
    /// into managed objects, releasing the native memory along the way.
    /// </summary>
    internal static class NativeExtensions
    {
        /// <summary>
        /// Converts a native pointer to a NUL-terminated UTF-8 buffer into a managed string.
        /// </summary>
        /// <param name="intPtr">Native pointer. May be <see cref="IntPtr.Zero"/>.</param>
        /// <param name="releasePtr">
        /// When true (the default) the pointer is passed to <c>DS_FreeString</c>
        /// after it has been read, even if decoding fails.
        /// </param>
        /// <returns>The decoded string, or null when <paramref name="intPtr"/> is <see cref="IntPtr.Zero"/>.</returns>
        internal static string PtrToString(this IntPtr intPtr, bool releasePtr = true)
        {
            // A null native pointer has nothing to read and nothing to free.
            if (intPtr == IntPtr.Zero)
            {
                return null;
            }

            try
            {
                // Find the terminating NUL to learn the byte length of the string.
                int len = 0;
                while (Marshal.ReadByte(intPtr, len) != 0)
                {
                    ++len;
                }

                byte[] buffer = new byte[len];
                Marshal.Copy(intPtr, buffer, 0, len);
                return Encoding.UTF8.GetString(buffer);
            }
            finally
            {
                // Release in finally so a marshalling failure cannot leak the native buffer.
                if (releasePtr)
                {
                    NativeImp.DS_FreeString(intPtr);
                }
            }
        }

        /// <summary>
        /// Converts a native Metadata pointer into a managed metadata object and
        /// frees the native struct with <c>DS_FreeMetadata</c>.
        /// </summary>
        /// <param name="intPtr">Native pointer. May be <see cref="IntPtr.Zero"/>.</param>
        /// <returns>The managed metadata object, or null when <paramref name="intPtr"/> is <see cref="IntPtr.Zero"/>.</returns>
        internal static Models.Metadata PtrToMetadata(this IntPtr intPtr)
        {
            // A null native pointer has nothing to read and nothing to free.
            if (intPtr == IntPtr.Zero)
            {
                return null;
            }

            try
            {
                var metaData = Marshal.PtrToStructure<Metadata>(intPtr);

                var managedMetaObject = new Models.Metadata();
                managedMetaObject.Items = new Models.MetadataItem[metaData.num_items];
                managedMetaObject.Probability = metaData.probability;

                // The items are laid out back to back in native memory, so walk them
                // with a cursor advanced by the marshalled size of one item.
                var sizeOfMetaItem = Marshal.SizeOf<MetadataItem>();
                var itemPtr = metaData.items;
                for (int i = 0; i < metaData.num_items; i++)
                {
                    var tempItem = Marshal.PtrToStructure<MetadataItem>(itemPtr);
                    managedMetaObject.Items[i] = new Models.MetadataItem
                    {
                        Timestep = tempItem.timestep,
                        StartTime = tempItem.start_time,
                        // The character buffer is released below together with the whole
                        // struct by DS_FreeMetadata, so it is not freed individually here.
                        Character = tempItem.character.PtrToString(releasePtr: false)
                    };
                    itemPtr += sizeOfMetaItem;
                }

                return managedMetaObject;
            }
            finally
            {
                // Release in finally so a marshalling failure cannot leak the native struct.
                NativeImp.DS_FreeMetadata(intPtr);
            }
        }
    }
}
Loading

0 comments on commit 9815d54

Please sign in to comment.