Skip to content

Commit

Permalink
fix: video language detection fix (#309)
Browse files Browse the repository at this point in the history
* fix: video language detection fix

* address requested changes

* fix: predictVideoLanguage function
  • Loading branch information
zeeshanakram3 authored Mar 8, 2024
1 parent 3127350 commit 8e5b98b
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 24 deletions.
20 changes: 12 additions & 8 deletions src/mappings/content/video.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { generateAppActionCommitment } from '@joystream/js/utils'
import {
AppAction,
AppActionMetadata,
Expand All @@ -8,13 +9,14 @@ import { DecodedMetadataObject } from '@joystream/metadata-protobuf/types'
import { integrateMeta } from '@joystream/metadata-protobuf/utils'
import {
Channel,
Video,
VideoViewEvent,
Event,
Video,
VideoCreatedEventData,
VideoPosted,
VideoViewEvent,
} from '../../model'
import { EventHandlerContext } from '../../utils/events'
import { predictVideoLanguage } from '../../utils/language'
import { deserializeMetadata, u8aToBytes, videoRelevanceManager } from '../utils'
import { processVideoMetadata } from './metadata'
import {
Expand All @@ -26,8 +28,6 @@ import {
processAppActionMetadata,
processNft,
} from './utils'
import { generateAppActionCommitment } from '@joystream/js/utils'
import { predictLanguage } from '../../utils/language'

export async function processVideoCreatedEvent({
overlay,
Expand Down Expand Up @@ -115,8 +115,10 @@ export async function processVideoCreatedEvent({
}
}

const languageText = [video.title ?? '', video.description ?? ''].join(' ')
video.orionLanguage = predictLanguage(languageText)
video.orionLanguage = predictVideoLanguage({
title: video.title ?? '',
description: video.description ?? '',
})

channel.totalVideosCreated += 1

Expand Down Expand Up @@ -183,8 +185,10 @@ export async function processVideoUpdatedEvent({
)
}

const languageText = [video.title ?? '', video.description ?? ''].join(' ')
video.orionLanguage = predictLanguage(languageText)
video.orionLanguage = predictVideoLanguage({
title: video.title ?? '',
description: video.description ?? '',
})

if (autoIssueNft) {
await processNft(overlay, block, indexInBlock, extrinsicHash, video, contentActor, autoIssueNft)
Expand Down
8 changes: 5 additions & 3 deletions src/utils/customMigrations/setOrionLanguage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { createLogger } from '@subsquid/logger'
import { IsNull } from 'typeorm'
import { Video } from '../../model'
import { globalEm } from '../globalEm'
import { predictLanguage } from '../language'
import { predictVideoLanguage } from '../language'

const logger = createLogger('setOrionLanguage')

Expand All @@ -25,8 +25,10 @@ async function setOrionLanguage() {
hasMore = false
} else {
const updates = videos.map((video) => {
const languageText = [video.title ?? '', video.description ?? ''].join(' ')
video.orionLanguage = predictLanguage(languageText)
video.orionLanguage = predictVideoLanguage({
title: video.title ?? '',
description: video.description ?? '',
})
return video
})

Expand Down
43 changes: 30 additions & 13 deletions src/utils/language.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,36 @@
import { detect } from 'tinyld'
import { detectAll } from 'tinyld'

function cleanString(input: string): string {
// Remove symbols, numbers, and emojis
// The regex [\p{P}\p{S}\p{N}\p{M}] matches all kinds of punctuation, symbols, numbers, and mark characters (including emojis)
// \p{P} matches any kind of punctuation character
// \p{S} matches any kind of math symbol, currency sign, or modifier symbol
// \p{N} matches any kind of numeric character in any script
// \p{M} matches characters that are combined with other characters, often used for emojis and diacritics
// The 'u' flag enables Unicode support, allowing the regex to match Unicode characters and properties
const cleanedString = input.replace(/[\p{P}\p{S}\p{N}\p{M}]/gu, '')
return cleanedString.toLowerCase()
// First, remove URLs. This pattern targets a broad range of URLs.
let cleanedString = input.replace(/(https?:\/\/[^\s]+)|(www\.[^\s]+)/gu, '')

// Remove hashtags. This regex looks for the '#' symbol followed by one or more word characters.
cleanedString = cleanedString.replace(/#\w+/gu, '')

return cleanedString
}

// Example usage
export const predictLanguage = (text: string): string | undefined => {
function predictLanguage(text: string): { lang: string; accuracy: number } | undefined {
const cleanedText = cleanString(text)
return detect(cleanedText) || undefined

// Get the most accurate language prediction
return detectAll(cleanedText)?.[0]
}

/**
 * Predicts the language of a video from its title and description.
 *
 * The title alone is tried first. When that prediction is missing or its
 * accuracy is below 0.5, the title and description are joined with a single
 * space and predicted together; the combined prediction replaces the
 * title-only one only if its accuracy is strictly higher.
 *
 * @param video - Object carrying the video's `title` and `description`.
 *   Missing (`null` / `undefined`) fields are treated as empty strings so they
 *   never leak the literal text "undefined" / "null" into the detector input.
 * @returns The `lang` of the selected prediction, or `undefined` when no
 *   prediction is available.
 */
export function predictVideoLanguage({
  title,
  description,
}: {
  title?: string | null
  description?: string | null
}): string | undefined {
  // Normalize once so both the title-only and the combined prediction see the same text.
  const titleText = title ?? ''
  const descriptionText = description ?? ''

  const titleLang = predictLanguage(titleText)
  const titleAccuracy = titleLang?.accuracy ?? 0

  // A sufficiently confident title prediction is used as-is.
  if (titleAccuracy >= 0.5) {
    return titleLang?.lang
  }

  // Otherwise give the detector more text and keep whichever prediction is more accurate.
  const combinedLang = predictLanguage(`${titleText} ${descriptionText}`)
  const combinedAccuracy = combinedLang?.accuracy ?? 0

  return combinedAccuracy > titleAccuracy ? combinedLang?.lang : titleLang?.lang
}

0 comments on commit 8e5b98b

Please sign in to comment.