fix: video language detection fix (#309)

* fix: video language detection fix
* address requested changes
* fix: predictVideoLanguage function
Joystream · Mar 8, 2024 · 8e5b98b · 8e5b98b
1 parent 3127350
commit 8e5b98b
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 24 deletions.
diff --git a/src/mappings/content/video.ts b/src/mappings/content/video.ts
@@ -1,3 +1,4 @@
+import { generateAppActionCommitment } from '@joystream/js/utils'
 import {
   AppAction,
   AppActionMetadata,
@@ -8,13 +9,14 @@ import { DecodedMetadataObject } from '@joystream/metadata-protobuf/types'
 import { integrateMeta } from '@joystream/metadata-protobuf/utils'
 import {
   Channel,
-  Video,
-  VideoViewEvent,
   Event,
+  Video,
   VideoCreatedEventData,
   VideoPosted,
+  VideoViewEvent,
 } from '../../model'
 import { EventHandlerContext } from '../../utils/events'
+import { predictVideoLanguage } from '../../utils/language'
 import { deserializeMetadata, u8aToBytes, videoRelevanceManager } from '../utils'
 import { processVideoMetadata } from './metadata'
 import {
@@ -26,8 +28,6 @@ import {
   processAppActionMetadata,
   processNft,
 } from './utils'
-import { generateAppActionCommitment } from '@joystream/js/utils'
-import { predictLanguage } from '../../utils/language'
 
 export async function processVideoCreatedEvent({
   overlay,
@@ -115,8 +115,10 @@ export async function processVideoCreatedEvent({
     }
   }
 
-  const languageText = [video.title ?? '', video.description ?? ''].join(' ')
-  video.orionLanguage = predictLanguage(languageText)
+  video.orionLanguage = predictVideoLanguage({
+    title: video.title ?? '',
+    description: video.description ?? '',
+  })
 
   channel.totalVideosCreated += 1
 
@@ -183,8 +185,10 @@ export async function processVideoUpdatedEvent({
     )
   }
 
-  const languageText = [video.title ?? '', video.description ?? ''].join(' ')
-  video.orionLanguage = predictLanguage(languageText)
+  video.orionLanguage = predictVideoLanguage({
+    title: video.title ?? '',
+    description: video.description ?? '',
+  })
 
   if (autoIssueNft) {
     await processNft(overlay, block, indexInBlock, extrinsicHash, video, contentActor, autoIssueNft)

diff --git a/src/utils/customMigrations/setOrionLanguage.ts b/src/utils/customMigrations/setOrionLanguage.ts
@@ -2,7 +2,7 @@ import { createLogger } from '@subsquid/logger'
 import { IsNull } from 'typeorm'
 import { Video } from '../../model'
 import { globalEm } from '../globalEm'
-import { predictLanguage } from '../language'
+import { predictVideoLanguage } from '../language'
 
 const logger = createLogger('setOrionLanguage')
 
@@ -25,8 +25,10 @@ async function setOrionLanguage() {
       hasMore = false
     } else {
       const updates = videos.map((video) => {
-        const languageText = [video.title ?? '', video.description ?? ''].join(' ')
-        video.orionLanguage = predictLanguage(languageText)
+        video.orionLanguage = predictVideoLanguage({
+          title: video.title ?? '',
+          description: video.description ?? '',
+        })
         return video
       })
 

diff --git a/src/utils/language.ts b/src/utils/language.ts
@@ -1,19 +1,36 @@
-import { detect } from 'tinyld'
+import { detectAll } from 'tinyld'
 
 function cleanString(input: string): string {
-  // Remove symbols, numbers, and emojis
-  // The regex [\p{P}\p{S}\p{N}\p{M}] matches all kinds of punctuation, symbols, numbers, and mark characters (including emojis)
-  // \p{P} matches any kind of punctuation character
-  // \p{S} matches any kind of math symbol, currency sign, or modifier symbol
-  // \p{N} matches any kind of numeric character in any script
-  // \p{M} matches characters that are combined with other characters, often used for emojis and diacritics
-  // The 'u' flag enables Unicode support, allowing the regex to match Unicode characters and properties
-  const cleanedString = input.replace(/[\p{P}\p{S}\p{N}\p{M}]/gu, '')
-  return cleanedString.toLowerCase()
+  // Returns `input` with URLs and hashtags removed; everything else
+  // (case, punctuation, whitespace) is left untouched.
+
+  // First, remove URLs: anything starting with http://, https:// or www.
+  // up to the next whitespace character.
+  let cleanedString = input.replace(/(https?:\/\/[^\s]+)|(www\.[^\s]+)/gu, '')
+
+  // Remove hashtags: '#' followed by letters, combining marks, digits or
+  // underscores in any script. `\w` is not used here because it only matches
+  // ASCII word characters (even with the 'u' flag) and would leave hashtags
+  // such as "#привет" in the text.
+  cleanedString = cleanedString.replace(/#[\p{L}\p{M}\p{N}_]+/gu, '')
+
+  return cleanedString
 }
 
-// Example usage
-export const predictLanguage = (text: string): string | undefined => {
+/**
+ * Runs `text` through `cleanString` and returns the first entry of the
+ * `detectAll` result for the cleaned text, or `undefined` when there is
+ * no first entry.
+ */
+function predictLanguage(text: string): { lang: string; accuracy: number } | undefined {
   const cleanedText = cleanString(text)
-  return detect(cleanedText) || undefined
+
+  // Get the most accurate language prediction
+  // NOTE(review): relies on detectAll ordering its results best-first — confirm against tinyld docs.
+  return detectAll(cleanedText)?.[0]
+}
+
+/**
+ * Predicts a video's language from its title and description.
+ *
+ * The title alone is tried first. If that prediction's accuracy is below 0.5,
+ * the title and description are joined with a space and tried again; the
+ * combined prediction is used only when its accuracy is strictly higher than
+ * the title-only accuracy.
+ *
+ * @param video - Object carrying the video's `title` and `description`.
+ *   Missing (`null` / `undefined`) values are treated as empty strings.
+ * @returns The `lang` of the chosen prediction, or `undefined` when the
+ *   detector produced no prediction.
+ */
+export function predictVideoLanguage({
+  title,
+  description,
+}: {
+  title?: string | null
+  description?: string | null
+}): string | undefined {
+  // Normalise once so a missing field can never reach the detector as the
+  // literal text "undefined" / "null" through template interpolation.
+  const titleText = title ?? ''
+  const descriptionText = description ?? ''
+
+  const titleLang = predictLanguage(titleText)
+  const titleAccuracy = titleLang?.accuracy ?? 0
+
+  // A confident title prediction is used as-is.
+  if (titleAccuracy >= 0.5) {
+    return titleLang?.lang
+  }
+
+  // Low-confidence title: check whether adding the description improves it.
+  const titleAndDescriptionLang = predictLanguage(`${titleText} ${descriptionText}`)
+  if ((titleAndDescriptionLang?.accuracy ?? 0) > titleAccuracy) {
+    return titleAndDescriptionLang?.lang
+  }
+
+  return titleLang?.lang
 }