Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Transcribe refactor #41

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,6 @@ downloads
public/javascripts/ffmpeg-core.wasm
.env

constants/apiTokens.txt
constants/apiTokens.txt

*.log
12 changes: 6 additions & 6 deletions app.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,14 @@ if (!fs.existsSync('.env')) {

const hourInMilliseconds = 1000 * 60 * 60;

function runDeleteLoop() {
/**
 * Schedule a recurring clean-up pass.
 *
 * Waits `hourInMilliseconds`, calls `deleteOldFiles(true)`, then re-arms
 * itself, so the clean-up repeats once per hour for as long as the process
 * lives. Nothing runs immediately; the first pass happens after one delay.
 */
function runDeleteLoop () {
  // One cycle: clean up, then queue the next cycle.
  const sweepThenReschedule = () => {
    deleteOldFiles(true);
    runDeleteLoop();
  };

  setTimeout(sweepThenReschedule, hourInMilliseconds); // repeat every hour
}

if(process.env.NODE_ENV === 'production'){
if (process.env.NODE_ENV === 'production') {
deleteOldFiles(true);
runDeleteLoop();
}
Expand All @@ -60,10 +60,10 @@ createWebSocketServer(server);

l = console.log;

// l = function(l){
// l = function(l) {
// var stack = (new Error()).stack.split(/\n/);
// // Chrome includes a single "Error" line, FF doesn't.
// if(stack[0].indexOf('Error') === 0){
// if (stack[0].indexOf('Error') === 0) {
// stack = stack.slice(1);
// }
// var args = [].slice.apply(arguments).concat([stack[1].trim()]);
Expand All @@ -79,7 +79,7 @@ fs.mkdirSync('transcriptions', { recursive: true })

// view engine setup
app.set('views', path.join(__dirname, 'views'));
app.set('view engine', 'jade');
app.set('view engine', 'pug');

app.use(favicon(path.join(__dirname,'public','images','favicon.ico')));
app.use(logger('dev'));
Expand All @@ -88,7 +88,7 @@ app.use(bodyParser.urlencoded({ extended: true, limit: '1mb' }));
app.use(cookieParser());
app.use(express.static(path.join(__dirname, 'public')));
// assumes nginx
// if(!isProd){
// if (!isProd) {
app.use(express.static(__dirname));
// }

Expand Down
7 changes: 4 additions & 3 deletions constants/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ const languagesArray = whisperLanguagesHumanReadableArray.map(lang => ({value: l
languagesArray.unshift({value: 'auto-detect', name: 'Auto-Detect'});

function getLanguageCodeForAllLanguages (languageName) {
// l(Object.values(languageNameMap));
let foundLanguageCode;
Object.keys(languageNameMap).forEach(languageCode =>{
if (languageNameMap[languageCode].name === languageName) {
Expand Down Expand Up @@ -67,7 +68,7 @@ const translationLanguages = [
{'code':'uk','name':'Ukranian'}
];

const languagesToTranslateTo = [
const targetLanguages = [
// {"code":"ar","name":"Arabic"}, // haven't got these two to work
// {"code":"zh","name":"Chinese"}, // webvtt format is too broken after translate
{'code':'en','name':'English'},
Expand Down Expand Up @@ -98,7 +99,7 @@ const languagesToTranscribe = [
// }

/**
 * Tell whether a language is one of the supported translation targets.
 *
 * `targetLanguages` is a list of `{code, name}` objects, so a plain
 * `Array.prototype.includes(languageName)` with a string can never match.
 * The lookup therefore compares against each entry's `name`. Passing an
 * entry object itself is still accepted, which was the only case the
 * previous `includes` call could match.
 *
 * @param {string|object} languageName - Human-readable language name
 *   (e.g. 'English') or an entry taken from `targetLanguages`.
 * @returns {boolean} true when the language is listed in `targetLanguages`.
 */
function shouldTranslateFrom (languageName) {
  return targetLanguages.some(
    language => language === languageName || language.name === languageName
  );
}

let newLanguagesMap = [];
Expand Down Expand Up @@ -135,6 +136,6 @@ module.exports = {
newLanguagesMap,
allLanguages,
modelsArray,
languagesToTranslateTo,
targetLanguages,
whisperLanguagesHumanReadableArray
}
3 changes: 2 additions & 1 deletion downloading/download.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ l = console.log;

// sync usage
// throws if not found
const ytDlpBinaryPath = which.sync('yt-dlp');
const ytDlpName = process.platform === 'win32' ? 'YoutubeDL' : 'yt-dlp';
const ytDlpBinaryPath = which.sync(ytDlpName);

const ytDlpWrap = new YTDlpWrap(ytDlpBinaryPath);

Expand Down
19 changes: 8 additions & 11 deletions downloading/yt-dlp-download.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@ const spawn = require('child_process').spawn;
const fs = require('fs-extra');
const projectConstants = require('../constants/constants');
const {formatStdErr} = require('../helpers/formatStdErr');
const { generateRandomNumber } = require('../helpers/helpers');

// yt-dlp --no-mtime -f '\''bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best'\''

const l = console.log;

const ytDlpPath = which.sync('yt-dlp')
const ytDlpName = process.platform === 'win32' ? 'YoutubeDL' : 'yt-dlp';
const ytDlpPath = which.sync(ytDlpName);

// extract download progress data from a yt-dlp stdout string
function extractDataFromString(string){
function extractDataFromString (string) {
const percentDownloaded = parseInt(string.match(/(\d+\.?\d*)%/)[1]);
const totalFileSize = string.match(/of\s+(.*?)\s+at/)[1];
const downloadSpeed = string.match(/at\s+(.*?)\s+ETA/)[1];
Expand All @@ -29,9 +30,9 @@ function extractDataFromString(string){
}

// delete from transcription array (used to get rid of the yt-dlp process)
function deleteFromGlobalTranscriptionsBasedOnWebsocketNumber(websocketNumber) {
function deleteFromGlobalTranscriptionsBasedOnWebsocketNumber (websocketNumber) {
// check for websocket number and type
function matchDownloadProcessByWebsocketNumber(transcriptionProcess){
function matchDownloadProcessByWebsocketNumber (transcriptionProcess) {
return transcriptionProcess.websocketNumber === websocketNumber && transcriptionProcess.type === 'download';
}

Expand Down Expand Up @@ -64,7 +65,7 @@ async function downloadFile ({
l(latestDownloadInfo);

// only run if ETA is in the string
if(!latestDownloadInfo.includes('ETA')) return
if (!latestDownloadInfo.includes('ETA')) return

const { percentDownloaded, totalFileSize, downloadSpeed, fileSizeUnit, fileSizeValue } = extractDataFromString(latestDownloadInfo);

Expand Down Expand Up @@ -154,7 +155,7 @@ async function downloadFileApi ({
l(latestDownloadInfo);

// only run if ETA is in the string
if(!latestDownloadInfo.includes('ETA')) return
if (!latestDownloadInfo.includes('ETA')) return

const { percentDownloaded, totalFileSize, downloadSpeed, fileSizeUnit, fileSizeValue } = extractDataFromString(latestDownloadInfo);
currentPercentDownload = percentDownloaded;
Expand Down Expand Up @@ -247,10 +248,6 @@ async function getFilename (videoUrl) {

const testUrl = 'https://www.youtube.com/watch?v=wnhvanMdx4s';

function generateRandomNumber () {
return Math.floor(Math.random() * 10000000000).toString();
}

const randomNumber = generateRandomNumber();

async function main () {
Expand Down
94 changes: 35 additions & 59 deletions helpers/formatStdErr.js
Original file line number Diff line number Diff line change
@@ -1,62 +1,38 @@
const l = console.log;

const ten = ' 10%|█ | 5332/52135 [00:10<01:25, 545.77frames/s]';

function formatStdErr (stdErrData) {
// if a progress output
if (stdErrData.includes('frames/s')) {
// looks like: '█ '
const progressBar = stdErrData.split('|')[1].split('|')[0]

// looks like: '10%'
let percentDone = stdErrData.split('|')[0].trim();

// looks like: 10
let percentDoneAsNumber = Number(stdErrData.split('%')[0].trim());

// looks like: '00:10<01:25, 545.77frames/s]'
let timeLeftPortion = stdErrData.split('[')[1].split('[')[0]

// looks like: '00:10<01:25'
const firstPortion = timeLeftPortion.split(',')[0]

// looks like: '00:10'
const timeElapsed = firstPortion.split('<')[0]

// looks like: '01:25'
const timeRemainingString = timeLeftPortion.split('<')[1].split(',')[0]

// looks like: '545.77'
const speed = timeLeftPortion.split('<')[1].split(',')[1].split('frames')[0].trim()

// looks like: '545.77'
const splitTimeRemaining = timeRemainingString.split(':')

// looks like: '01'
const secondsRemaining = Number(splitTimeRemaining.pop());

// looks like: '25'
const minutesRemaining = Number(splitTimeRemaining.pop());

// looks like: 'NaN'
const hoursRemaining = Number(splitTimeRemaining.pop());

// format for lib
return {
progressBar,
percentDone,
timeElapsed,
speed,
percentDoneAsNumber,
timeRemaining: {
string: timeRemainingString,
hoursRemaining,
minutesRemaining,
secondsRemaining
},
}
} else {
return false
// example of stdErrData: ' 10%|█ | 5332/52135 [00:10<01:25, 545.77frames/s]'

/**
 * Parse one tqdm-style progress line, e.g.
 * ' 10%|█         | 5332/52135 [00:10<01:25, 545.77frames/s]'.
 *
 * Durations may be 'MM:SS' or, from one hour upwards, 'H:MM:SS' with an
 * unpadded hour count; both forms are accepted for the elapsed and the
 * remaining time. While no estimate exists yet the remaining time and
 * the speed are both '?'.
 *
 * @param {string} stdErrData - A chunk of stderr output.
 * @returns {false|{
 *   progressBar: string,
 *   percentDone: string,
 *   timeElapsed: string,
 *   speed: string,
 *   percentDoneAsNumber: number,
 *   timeRemaining: {
 *     string: string,
 *     hoursRemaining: number,
 *     minutesRemaining: number,
 *     secondsRemaining: number
 *   }
 * }} `false` when the input is not a progress line. Parts of the remaining
 *   time that are absent (no hour part, or a '?' estimate) are `NaN`.
 */
const formatStdErr = stdErrData => {
  // Capture groups, in order: '10%', '10', bar, elapsed, remaining,
  // remaining hours (optional), remaining minutes, remaining seconds, speed.
  const progressLinePattern = /^\D*((\d+)%)\|([\u2580-\u2590\s]+)\|\s*\d+\/\d+\s\[((?:\d+:)?\d\d:\d\d)<((?:(\d+):)?(\d\d):(\d\d)|\?),\s*(\d+\.\d\d|\?)frames\/s\]/;

  // Run the regex once; `exec` returns null for anything that is not a progress line.
  const match = progressLinePattern.exec(stdErrData);
  if (!match) return false;

  const [
    , // whole match is not needed
    percentDone, // looks like: '10%'
    percentDoneAsNumber, // looks like: '10'
    progressBar, // looks like: '█         '
    timeElapsed, // looks like: '00:10' (or '1:00:10')
    timeRemaining, // looks like: '01:25' (or '2:01:25', or '?')
    hoursRemaining, // looks like: undefined (or '2')
    minutesRemaining, // looks like: '01'
    secondsRemaining, // looks like: '25'
    speed // looks like: '545.77' (or '?')
  ] = match;

  // format for lib
  return {
    progressBar,
    percentDone,
    timeElapsed,
    speed,
    percentDoneAsNumber: Number(percentDoneAsNumber),
    timeRemaining: {
      string: timeRemaining,
      // Number(undefined) is NaN, which marks a part missing from the line.
      hoursRemaining: Number(hoursRemaining),
      minutesRemaining: Number(minutesRemaining),
      secondsRemaining: Number(secondsRemaining)
    },
  };
};

Expand Down
23 changes: 20 additions & 3 deletions helpers/helpers.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@ function forHumansHoursAndMinutes ( seconds ) {
return returntext.trim();
}



const decrementBySecond = timeRemainingValues => {
let {secondsRemaining, minutesRemaining, hoursRemaining} = timeRemainingValues;

Expand Down Expand Up @@ -86,9 +84,28 @@ const decrementBySecond = timeRemainingValues => {
}
}

/**
 * Count the entries of `global.jobProcesses` whose value is not `undefined`.
 *
 * @returns {number} Number of slots that currently hold a value.
 */
const getamountOfRunningJobs = () => {
  let occupiedSlots = 0;
  for (const slotValue of Object.values(global.jobProcesses)) {
    // Only `undefined` counts as empty; null, 0 etc. are still counted.
    if (slotValue !== undefined) occupiedSlots += 1;
  }
  return occupiedSlots;
};

/**
 * Serialise `data` as JSON and send it over the given connection.
 *
 * @param {{send: Function}} websocketConnection - Object exposing `send(payload, callback)`.
 * @param {*} data - Any JSON-serialisable value.
 * @returns {undefined}
 */
const sendToWebsocket = (websocketConnection, data) => {
  // A do-nothing completion callback is always supplied to `send`.
  const ignoreSendResult = function () {};
  websocketConnection.send(JSON.stringify(data), ignoreSendResult);
};

// TODO: not the world's greatest implementation
/**
 * Produce a pseudo-random integer in [0, 10^10) as a decimal string.
 * Based on Math.random(), so values are neither unique nor unpredictable.
 *
 * @returns {string} Between 1 and 10 decimal digits.
 */
const generateRandomNumber = () => {
  const upperBound = 10_000_000_000;
  const randomInteger = Math.floor(Math.random() * upperBound);
  return `${randomInteger}`;
};

/**
 * Convert a string to title case: lower-case everything, then upper-case
 * the first letter of every word.
 *
 * Word starts are detected with Unicode letter/number classes instead of the
 * ASCII-only `\b`, so accented letters neither split a word ("español" no
 * longer becomes "EspañOl") nor stay lower-case at its start ("émile" becomes
 * "Émile"). For pure-ASCII input the result matches the ASCII-only rule.
 *
 * @param {string} str - Text to convert.
 * @returns {string} The title-cased text; falsy or whitespace-only input is
 *   returned unchanged.
 */
const toTitleCase = str => {
  if (!str || !str.trim()) return str;

  // A lower-case letter starts a word when it is not preceded by a letter,
  // digit, combining mark or underscore.
  const wordInitial = /(?<![\p{L}\p{N}\p{M}_])\p{Ll}/gu;
  return str.toLowerCase().replace(wordInitial, letter => letter.toUpperCase());
};

module.exports = {
forHumans,
forHumansNoSeconds,
decrementBySecond,
forHumansHoursAndMinutes
forHumansHoursAndMinutes,
getamountOfRunningJobs,
sendToWebsocket,
generateRandomNumber,
toTitleCase,
}
46 changes: 14 additions & 32 deletions lib/convertText.js
Original file line number Diff line number Diff line change
@@ -1,41 +1,23 @@
const fs = require('fs-extra');
const convert = require('cyrillic-to-latin');
const { simplified } = require('zh-convert');
const {simplified} = require('zh-convert');

async function convertSerbianCyrillicToLatin ({
transcribedSrtFilePath,
transcribedVttFilePath,
transcribedTxtFilePath,
}) {
let data = await fs.readFile(transcribedSrtFilePath, 'utf-8');
let latinCharactersText = convert(data);
await fs.writeFile(transcribedSrtFilePath, latinCharactersText, 'utf-8');
const fileTypes = ['srt', 'vtt', 'txt'];

data = await fs.readFile(transcribedVttFilePath, 'utf-8');
latinCharactersText = convert(data);
await fs.writeFile(transcribedVttFilePath, latinCharactersText, 'utf-8');

data = await fs.readFile(transcribedTxtFilePath, 'utf-8');
latinCharactersText = convert(data);
await fs.writeFile(transcribedTxtFilePath, latinCharactersText, 'utf-8');
/**
 * Rewrite the transcription files `<path>.srt`, `<path>.vtt` and `<path>.txt`
 * (one per entry of `fileTypes`) in place, passing their contents through
 * `convert` from 'cyrillic-to-latin'.
 *
 * The files are processed concurrently. The returned promise settles only
 * after every file has been written, and rejects if any read or write fails.
 * An `async` callback handed to `forEach` is not awaited, which would let the
 * function resolve early and turn I/O errors into unhandled rejections.
 *
 * @param {string} path - File path without the extension.
 * @returns {Promise<void>}
 */
const convertSerbianCyrillicToLatin = async path => {
  await Promise.all(fileTypes.map(async fileType => {
    const filePath = `${path}.${fileType}`;
    const data = await fs.readFile(filePath, 'utf-8');
    const latinCharactersText = convert(data);
    await fs.writeFile(filePath, latinCharactersText, 'utf-8');
  }));
};

async function convertChineseTraditionalToSimplified ({
transcribedSrtFilePath,
transcribedVttFilePath,
transcribedTxtFilePath,
}) {
let data = await fs.readFile(transcribedSrtFilePath, 'utf-8');
let simplifiedText = simplified(data);
await fs.writeFile(transcribedSrtFilePath, simplifiedText, 'utf-8');

data = await fs.readFile(transcribedVttFilePath, 'utf-8');
simplifiedText = simplified(data);
await fs.writeFile(transcribedVttFilePath, simplifiedText, 'utf-8');

data = await fs.readFile(transcribedTxtFilePath, 'utf-8');
simplifiedText = simplified(data);
await fs.writeFile(transcribedTxtFilePath, simplifiedText, 'utf-8');
/**
 * Rewrite the transcription files `<path>.srt`, `<path>.vtt` and `<path>.txt`
 * (one per entry of `fileTypes`) in place, passing their contents through
 * `simplified` from 'zh-convert'.
 *
 * The files are processed concurrently. The returned promise settles only
 * after every file has been written, and rejects if any read or write fails.
 * An `async` callback handed to `forEach` is not awaited, which would let the
 * function resolve early and turn I/O errors into unhandled rejections.
 *
 * @param {string} path - File path without the extension.
 * @returns {Promise<void>}
 */
const convertChineseTraditionalToSimplified = async path => {
  await Promise.all(fileTypes.map(async fileType => {
    const filePath = `${path}.${fileType}`;
    const data = await fs.readFile(filePath, 'utf-8');
    const simplifiedText = simplified(data);
    await fs.writeFile(filePath, simplifiedText, 'utf-8');
  }));
};

module.exports = {
Expand Down
Loading