
Commit bc88eb1

Committed Nov 25, 2022
examples : add "command" tool (ggerganov#171)
1 parent b8ce25d

File tree: 9 files changed, +735 -51 lines

.gitignore

+1 line

@@ -13,6 +13,7 @@ build-sanitize-thread/
 
 main
 stream
+command
 bench
 sync.sh
 compile_commands.json

Makefile

+4 -1 lines

@@ -134,7 +134,7 @@ libwhisper.so: ggml.o whisper.o
         $(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)
 
 clean:
-        rm -f *.o main stream bench libwhisper.a libwhisper.so
+        rm -f *.o main stream command bench libwhisper.a libwhisper.so
 
 #
 # Examples

@@ -149,6 +149,9 @@ main: examples/main/main.cpp ggml.o whisper.o
 stream: examples/stream/stream.cpp ggml.o whisper.o
         $(CXX) $(CXXFLAGS) examples/stream/stream.cpp ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
 
+command: examples/command/command.cpp ggml.o whisper.o
+        $(CXX) $(CXXFLAGS) examples/command/command.cpp ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
+
 bench: examples/bench/bench.cpp ggml.o whisper.o
         $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)

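For reference, the new `command` rule mirrors the existing `stream` rule: the example is compiled together with ggml.o and whisper.o and linked against SDL via `$(CC_SDL)`. Assuming `CC_SDL` resolves to the usual `sdl2-config` output (an assumption, not shown in this diff), the rule is roughly equivalent to this manual compile:

```bash
# Approximate manual equivalent of `make command`
# (assumes CC_SDL comes from `sdl2-config`; adjust for your SDL2 installation)
c++ -I. -I./examples -O3 -std=c++11 -pthread \
    examples/command/command.cpp ggml.o whisper.o \
    -o command $(sdl2-config --cflags --libs)
```
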
README.md

+26 -25 lines

@@ -98,26 +98,27 @@ c++ -I. -I./examples -O3 -std=c++11 -pthread examples/main/main.cpp whisper.o gg
 usage: ./main [options] file0.wav file1.wav ...
 
 options:
-  -h,        --help           show this help message and exit
-  -t N,      --threads N      number of threads to use during computation (default: 4)
-  -p N,      --processors N   number of processors to use during computation (default: 1)
-  -ot N,     --offset-t N     time offset in milliseconds (default: 0)
-  -on N,     --offset-n N     segment index offset (default: 0)
-  -mc N,     --max-context N  maximum number of text context tokens to store (default: max)
-  -ml N,     --max-len N      maximum segment length in characters (default: 0)
-  -wt N,     --word-thold N   word timestamp probability threshold (default: 0.010000)
-  -v,        --verbose        verbose output
-             --translate      translate from source language to english
-  -otxt,     --output-txt     output result in a text file
-  -ovtt,     --output-vtt     output result in a vtt file
-  -osrt,     --output-srt     output result in a srt file
-  -owts,     --output-words   output script for generating karaoke video
-  -ps,       --print_special  print special tokens
-  -pc,       --print_colors   print colors
-  -nt,       --no_timestamps  do not print timestamps
-  -l LANG,   --language LANG  spoken language (default: en)
-  -m FNAME,  --model FNAME    model path (default: models/ggml-base.en.bin)
-  -f FNAME,  --file FNAME     input WAV file path
+  -h,       --help          [default] show this help message and exit
+  -t N,     --threads N     [4      ] number of threads to use during computation
+  -p N,     --processors N  [1      ] number of processors to use during computation
+  -ot N,    --offset-t N    [0      ] time offset in milliseconds
+  -on N,    --offset-n N    [0      ] segment index offset
+  -d N,     --duration N    [0      ] duration of audio to process in milliseconds
+  -mc N,    --max-context N [-1     ] maximum number of text context tokens to store
+  -ml N,    --max-len N     [0      ] maximum segment length in characters
+  -wt N,    --word-thold N  [0.01   ] word timestamp probability threshold
+  -su,      --speed-up      [false  ] speed up audio by x2 (reduced accuracy)
+  -tr,      --translate     [false  ] translate from source language to english
+  -otxt,    --output-txt    [false  ] output result in a text file
+  -ovtt,    --output-vtt    [false  ] output result in a vtt file
+  -osrt,    --output-srt    [false  ] output result in a srt file
+  -owts,    --output-words  [false  ] output script for generating karaoke video
+  -ps,      --print-special [false  ] print special tokens
+  -pc,      --print-colors  [false  ] print colors
+  -nt,      --no-timestamps [true   ] do not print timestamps
+  -l LANG,  --language LANG [en     ] spoken language
+  -m FNAME, --model FNAME   [models/ggml-base.en.bin] model path
+  -f FNAME, --file FNAME    [       ] input WAV file path
 
 bash ./models/download-ggml-model.sh base.en
 Downloading ggml model base.en ...

@@ -149,13 +150,13 @@ whisper_model_load: n_text_layer = 6
 whisper_model_load: n_mels = 80
 whisper_model_load: f16 = 1
 whisper_model_load: type = 2
-whisper_model_load: mem_required = 670.00 MB
 whisper_model_load: adding 1607 extra tokens
-whisper_model_load: ggml ctx size = 140.60 MB
-whisper_model_load: memory size = 22.83 MB
-whisper_model_load: model size = 140.54 MB
+whisper_model_load: mem_required = 506.00 MB
+whisper_model_load: ggml ctx size = 140.60 MB
+whisper_model_load: memory size = 22.83 MB
+whisper_model_load: model size = 140.54 MB
 
-system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
+system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
 
 main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...

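The updated help text above lists the current flag set of `./main`. As an illustration only (not part of this commit), a typical invocation combining several of the documented options could look like:

```bash
# Transcribe a WAV file with 4 threads, emit an .srt file, and keep the default
# word-timestamp threshold of 0.01 (paths are illustrative)
./main -m models/ggml-base.en.bin -f samples/jfk.wav -t 4 -osrt -wt 0.01
```
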
examples/CMakeLists.txt

+1 line

@@ -24,5 +24,6 @@ if (EMSCRIPTEN)
 else()
     add_subdirectory(main)
     add_subdirectory(stream)
+    add_subdirectory(command)
     add_subdirectory(bench)
 endif()

examples/command/CMakeLists.txt

+7 lines (new file)

@@ -0,0 +1,7 @@
+if (WHISPER_SUPPORT_SDL2)
+    # command
+    set(TARGET command)
+    add_executable(${TARGET} command.cpp)
+    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+    target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+endif ()

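The new target is only generated when the build is configured with SDL2 support, as the `WHISPER_SUPPORT_SDL2` guard above shows. A CMake build that includes it might therefore look like this sketch (directory layout is illustrative):

```bash
# Configure with SDL2 support enabled so the `command` target exists, then build it
mkdir -p build && cd build
cmake -DWHISPER_SUPPORT_SDL2=ON ..
make command
```
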
examples/command/README.md

+26 lines (new file)

@@ -0,0 +1,26 @@
+# command
+
+This is a basic Voice Assistant example that accepts voice commands from the microphone.
+More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/issues/171).
+
+```java
+# Run with default arguments and small model
+./command -m ./models/ggml-small.en.bin -t 8
+
+# On Raspberry Pi, use tiny or base models + "-ac 768" for better performance
+./bin/command -m ../models/ggml-tiny.en.bin -ac 768
+```
+
+## Building
+
+The `command` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
+
+```bash
+# Install SDL2 on Linux
+sudo apt-get install libsdl2-dev
+
+# Install SDL2 on Mac OS
+brew install sdl2
+
+make command
+```

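Putting the pieces of this commit together, a complete first run might look like the following sketch (model choice and thread count are illustrative; the download script is the one referenced in the main README):

```bash
# Download a model, build the SDL2-based example, and start listening for voice commands
bash ./models/download-ggml-model.sh small.en
make command
./command -m ./models/ggml-small.en.bin -t 8
```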