diff --git a/src/mappings/content/video.ts b/src/mappings/content/video.ts index 931524ba4..1b8c78aec 100644 --- a/src/mappings/content/video.ts +++ b/src/mappings/content/video.ts @@ -1,3 +1,4 @@ +import { generateAppActionCommitment } from '@joystream/js/utils' import { AppAction, AppActionMetadata, @@ -8,13 +9,14 @@ import { DecodedMetadataObject } from '@joystream/metadata-protobuf/types' import { integrateMeta } from '@joystream/metadata-protobuf/utils' import { Channel, - Video, - VideoViewEvent, Event, + Video, VideoCreatedEventData, VideoPosted, + VideoViewEvent, } from '../../model' import { EventHandlerContext } from '../../utils/events' +import { predictVideoLanguage } from '../../utils/language' import { deserializeMetadata, u8aToBytes, videoRelevanceManager } from '../utils' import { processVideoMetadata } from './metadata' import { @@ -26,8 +28,6 @@ import { processAppActionMetadata, processNft, } from './utils' -import { generateAppActionCommitment } from '@joystream/js/utils' -import { predictLanguage } from '../../utils/language' export async function processVideoCreatedEvent({ overlay, @@ -115,8 +115,10 @@ export async function processVideoCreatedEvent({ } } - const languageText = [video.title ?? '', video.description ?? ''].join(' ') - video.orionLanguage = predictLanguage(languageText) + video.orionLanguage = predictVideoLanguage({ + title: video.title ?? '', + description: video.description ?? '', + }) channel.totalVideosCreated += 1 @@ -183,8 +185,10 @@ export async function processVideoUpdatedEvent({ ) } - const languageText = [video.title ?? '', video.description ?? ''].join(' ') - video.orionLanguage = predictLanguage(languageText) + video.orionLanguage = predictVideoLanguage({ + title: video.title ?? '', + description: video.description ?? '', + }) if (autoIssueNft) { await processNft(overlay, block, indexInBlock, extrinsicHash, video, contentActor, autoIssueNft) diff --git a/src/utils/customMigrations/setOrionLanguage.ts b/src/utils/customMigrations/setOrionLanguage.ts index 7296ce8d2..a75460ca9 100644 --- a/src/utils/customMigrations/setOrionLanguage.ts +++ b/src/utils/customMigrations/setOrionLanguage.ts @@ -2,7 +2,7 @@ import { createLogger } from '@subsquid/logger' import { IsNull } from 'typeorm' import { Video } from '../../model' import { globalEm } from '../globalEm' -import { predictLanguage } from '../language' +import { predictVideoLanguage } from '../language' const logger = createLogger('setOrionLanguage') @@ -25,8 +25,10 @@ async function setOrionLanguage() { hasMore = false } else { const updates = videos.map((video) => { - const languageText = [video.title ?? '', video.description ?? ''].join(' ') - video.orionLanguage = predictLanguage(languageText) + video.orionLanguage = predictVideoLanguage({ + title: video.title ?? '', + description: video.description ?? '', + }) return video }) diff --git a/src/utils/language.ts b/src/utils/language.ts index 03cd577f8..a9f8c2b45 100644 --- a/src/utils/language.ts +++ b/src/utils/language.ts @@ -1,19 +1,36 @@ -import { detect } from 'tinyld' +import { detectAll } from 'tinyld' function cleanString(input: string): string { - // Remove symbols, numbers, and emojis - // The regex [\p{P}\p{S}\p{N}\p{M}] matches all kinds of punctuation, symbols, numbers, and mark characters (including emojis) - // \p{P} matches any kind of punctuation character - // \p{S} matches any kind of math symbol, currency sign, or modifier symbol - // \p{N} matches any kind of numeric character in any script - // \p{M} matches characters that are combined with other characters, often used for emojis and diacritics - // The 'u' flag enables Unicode support, allowing the regex to match Unicode characters and properties - const cleanedString = input.replace(/[\p{P}\p{S}\p{N}\p{M}]/gu, '') - return cleanedString.toLowerCase() + // First, remove URLs. This pattern targets a broad range of URLs. + let cleanedString = input.replace(/(https?:\/\/[^\s]+)|(www\.[^\s]+)/gu, '') + + // Remove hashtags. This regex looks for the '#' symbol followed by one or more word characters. + cleanedString = cleanedString.replace(/#\w+/gu, '') + + return cleanedString } -// Example usage -export const predictLanguage = (text: string): string | undefined => { +function predictLanguage(text: string): { lang: string; accuracy: number } | undefined { const cleanedText = cleanString(text) - return detect(cleanedText) || undefined + + // Get the most accurate language prediction + return detectAll(cleanedText)?.[0] +} + +export function predictVideoLanguage({ title, description }: any): string | undefined { + let detectedLang: string | undefined + + const titleLang = predictLanguage(title ?? '') + + detectedLang = titleLang?.lang + + if ((titleLang?.accuracy || 0) < 0.5) { + const titleAndDescriptionLang = predictLanguage(`${title} ${description}`) + if ((titleAndDescriptionLang?.accuracy || 0) > (titleLang?.accuracy || 0)) { + // then + detectedLang = titleAndDescriptionLang?.lang + } + } + + return detectedLang }