Skip to content

Commit b62027a

Browse files
authored
[FIX] Issue#1665 Enhanced Matroska Language Tag Handling (#1671)
* fix unknown element for IETF tag
* added documentation changes
* added formatting for clang-format
1 parent 9685ad6 commit b62027a

File tree

3 files changed

+95
-27
lines changed

3 files changed

+95
-27
lines changed

docs/CHANGES.TXT

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
1.0 (to be released)
22
-----------------
3+
- Fix: Improved handling of IETF language tags in Matroska files (#1665)
34
- New: Create unit test for rust code (#1615)
45
- Breaking: Major argument flags revamp for CCExtractor (#1564 & #1619)
56
- New: Create a Docker image to simplify the CCExtractor usage without any environmental hassle (#1611)

src/lib_ccx/matroska.c

Lines changed: 92 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -154,10 +154,13 @@ void parse_ebml(FILE *file)
154154
default:
155155
if (code_len == MATROSKA_MAX_ID_LENGTH)
156156
{
157-
mprint(MATROSKA_ERROR "Unknown element 0x%x at position " LLD ", skipping EBML block\n", code,
157+
mprint(MATROSKA_WARNING "Unknown element 0x%x at position " LLD ", skipping this element\n", code,
158158
get_current_byte(file) - MATROSKA_MAX_ID_LENGTH);
159-
set_bytes(file, pos + len);
160-
return;
159+
// Skip just the unknown element, not the entire block
160+
read_vint_block_skip(file);
161+
// Reset code and code_len to start fresh with next element
162+
code = 0;
163+
code_len = 0;
161164
}
162165
break;
163166
}
@@ -232,10 +235,13 @@ void parse_segment_info(FILE *file)
232235
default:
233236
if (code_len == MATROSKA_MAX_ID_LENGTH)
234237
{
235-
mprint(MATROSKA_ERROR "Unknown element 0x%x at position " LLD ", skipping segment info block\n", code,
238+
mprint(MATROSKA_WARNING "Unknown element 0x%x at position " LLD ", skipping this element\n", code,
236239
get_current_byte(file) - MATROSKA_MAX_ID_LENGTH);
237-
set_bytes(file, pos + len);
238-
return;
240+
// Skip just the unknown element, not the entire block
241+
read_vint_block_skip(file);
242+
// Reset code and code_len to start fresh with next element
243+
code = 0;
244+
code_len = 0;
239245
}
240246
break;
241247
}
@@ -489,10 +495,13 @@ void parse_segment_cluster_block_group(struct matroska_ctx *mkv_ctx, ULLONG clus
489495
default:
490496
if (code_len == MATROSKA_MAX_ID_LENGTH)
491497
{
492-
mprint(MATROSKA_ERROR "Unknown element 0x%x at position " LLD ", skipping segment cluster block group\n", code,
498+
mprint(MATROSKA_WARNING "Unknown element 0x%x at position " LLD ", skipping this element\n", code,
493499
get_current_byte(file) - MATROSKA_MAX_ID_LENGTH);
494-
set_bytes(file, pos + len);
495-
return;
500+
// Skip just the unknown element, not the entire block
501+
read_vint_block_skip(file);
502+
// Reset code and code_len to start fresh with next element
503+
code = 0;
504+
code_len = 0;
496505
}
497506
break;
498507
}
@@ -597,10 +606,13 @@ void parse_segment_cluster(struct matroska_ctx *mkv_ctx)
597606
default:
598607
if (code_len == MATROSKA_MAX_ID_LENGTH)
599608
{
600-
mprint(MATROSKA_ERROR "Unknown element 0x%x at position " LLD ", skipping segment cluster block\n", code,
609+
mprint(MATROSKA_WARNING "Unknown element 0x%x at position " LLD ", skipping this element\n", code,
601610
get_current_byte(file) - MATROSKA_MAX_ID_LENGTH);
602-
set_bytes(file, pos + len);
603-
return;
611+
// Skip just the unknown element, not the entire block
612+
read_vint_block_skip(file);
613+
// Reset code and code_len to start fresh with next element
614+
code = 0;
615+
code_len = 0;
604616
}
605617
break;
606618
}
@@ -728,6 +740,7 @@ void parse_segment_track_entry(struct matroska_ctx *mkv_ctx)
728740
enum matroska_track_entry_type track_type = MATROSKA_TRACK_TYPE_VIDEO;
729741
char *lang = strdup("eng");
730742
char *header = NULL;
743+
char *lang_ietf = NULL;
731744
char *codec_id_string = NULL;
732745
enum matroska_track_subtitle_codec_id codec_id = MATROSKA_TRACK_SUBTITLE_CODEC_ID_UTF8;
733746

@@ -863,6 +876,31 @@ void parse_segment_track_entry(struct matroska_ctx *mkv_ctx)
863876
case MATROSKA_SEGMENT_TRACK_TRICK_MASTER_TRACK_SEGMENT_UID:
864877
read_vint_block_skip(file);
865878
MATROSKA_SWITCH_BREAK(code, code_len);
879+
case MATROSKA_SEGMENT_TRACK_LANGUAGE_IETF:
880+
lang_ietf = read_vint_block_string(file);
881+
mprint(" Language IETF: %s\n", lang_ietf);
882+
// We'll store this for later use rather than freeing it immediately
883+
if (track_type == MATROSKA_TRACK_TYPE_SUBTITLE)
884+
{
885+
// Don't free lang_ietf here, store in track
886+
if (lang != NULL)
887+
{
888+
// If we previously allocated lang, free it as we'll prefer IETF
889+
free(lang);
890+
lang = NULL;
891+
}
892+
// Default to "eng" if we somehow don't have a language yet
893+
if (lang == NULL)
894+
{
895+
lang = strdup("eng");
896+
}
897+
}
898+
else
899+
{
900+
free(lang_ietf); // Free if not a subtitle track
901+
lang_ietf = NULL;
902+
}
903+
MATROSKA_SWITCH_BREAK(code, code_len);
866904

867905
/* Misc ids */
868906
case MATROSKA_VOID:
@@ -874,10 +912,13 @@ void parse_segment_track_entry(struct matroska_ctx *mkv_ctx)
874912
default:
875913
if (code_len == MATROSKA_MAX_ID_LENGTH)
876914
{
877-
mprint(MATROSKA_ERROR "Unknown element 0x%x at position " LLD ", skipping segment track entry block\n", code,
915+
mprint(MATROSKA_WARNING "Unknown element 0x%x at position " LLD ", skipping this element\n", code,
878916
get_current_byte(file) - MATROSKA_MAX_ID_LENGTH);
879-
set_bytes(file, pos + len);
880-
return;
917+
// Skip just the unknown element, not the entire block
918+
read_vint_block_skip(file);
919+
// Reset code and code_len to start fresh with next element
920+
code = 0;
921+
code_len = 0;
881922
}
882923
break;
883924
}
@@ -888,6 +929,7 @@ void parse_segment_track_entry(struct matroska_ctx *mkv_ctx)
888929
struct matroska_sub_track *sub_track = malloc(sizeof(struct matroska_sub_track));
889930
sub_track->header = header;
890931
sub_track->lang = lang;
932+
sub_track->lang_ietf = lang_ietf;
891933
sub_track->track_number = track_number;
892934
sub_track->lang_index = 0;
893935
sub_track->codec_id = codec_id;
@@ -904,6 +946,8 @@ void parse_segment_track_entry(struct matroska_ctx *mkv_ctx)
904946
else
905947
{
906948
free(lang);
949+
if (lang_ietf)
950+
free(lang_ietf);
907951
if (codec_id_string)
908952
free(codec_id_string);
909953
}
@@ -997,10 +1041,13 @@ void parse_segment_tracks(struct matroska_ctx *mkv_ctx)
9971041
default:
9981042
if (code_len == MATROSKA_MAX_ID_LENGTH)
9991043
{
1000-
mprint(MATROSKA_ERROR "Unknown element 0x%x at position " LLD ", skipping segment tracks block\n", code,
1044+
mprint(MATROSKA_WARNING "Unknown element 0x%x at position " LLD ", skipping this element\n", code,
10011045
get_current_byte(file) - MATROSKA_MAX_ID_LENGTH);
1002-
set_bytes(file, pos + len);
1003-
return;
1046+
// Skip just the unknown element, not the entire block
1047+
read_vint_block_skip(file);
1048+
// Reset code and code_len to start fresh with next element
1049+
code = 0;
1050+
code_len = 0;
10041051
}
10051052
break;
10061053
}
@@ -1058,10 +1105,13 @@ void parse_segment(struct matroska_ctx *mkv_ctx)
10581105
default:
10591106
if (code_len == MATROSKA_MAX_ID_LENGTH)
10601107
{
1061-
mprint(MATROSKA_ERROR "Unknown element 0x%x at position " LLD ", skipping segment block\n", code,
1108+
mprint(MATROSKA_WARNING "Unknown element 0x%x at position " LLD ", skipping this element\n", code,
10621109
get_current_byte(file) - MATROSKA_MAX_ID_LENGTH);
1063-
set_bytes(file, pos + len);
1064-
return;
1110+
// Skip just the unknown element, not the entire block
1111+
read_vint_block_skip(file);
1112+
// Reset code and code_len to start fresh with next element
1113+
code = 0;
1114+
code_len = 0;
10651115
}
10661116
break;
10671117
}
@@ -1071,11 +1121,15 @@ void parse_segment(struct matroska_ctx *mkv_ctx)
10711121
char *generate_filename_from_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *track)
10721122
{
10731123
char *buf = malloc(sizeof(char) * 200);
1124+
// Use lang_ietf if available, otherwise fall back to lang
1125+
const char *lang_to_use = track->lang_ietf ? track->lang_ietf : track->lang;
1126+
10741127
if (track->lang_index == 0)
1075-
sprintf(buf, "%s_%s.%s", get_basename(mkv_ctx->filename), track->lang, matroska_track_text_subtitle_id_extensions[track->codec_id]);
1076-
else
1077-
sprintf(buf, "%s_%s_" LLD ".%s", get_basename(mkv_ctx->filename), track->lang, track->lang_index,
1128+
sprintf(buf, "%s_%s.%s", get_basename(mkv_ctx->filename), lang_to_use,
10781129
matroska_track_text_subtitle_id_extensions[track->codec_id]);
1130+
else
1131+
sprintf(buf, "%s_%s_" LLD ".%s", get_basename(mkv_ctx->filename), lang_to_use,
1132+
track->lang_index, matroska_track_text_subtitle_id_extensions[track->codec_id]);
10791133
return buf;
10801134
}
10811135

@@ -1263,6 +1317,8 @@ void free_sub_track(struct matroska_sub_track *track)
12631317
free(track->header);
12641318
if (track->lang != NULL)
12651319
free(track->lang);
1320+
if (track->lang_ietf != NULL)
1321+
free(track->lang_ietf);
12661322
if (track->codec_id_string != NULL)
12671323
free(track->codec_id_string);
12681324
for (int i = 0; i < track->sentence_count; i++)
@@ -1281,7 +1337,12 @@ void matroska_save_all(struct matroska_ctx *mkv_ctx, char *lang)
12811337
{
12821338
if (lang)
12831339
{
1284-
if ((match = strstr(lang, mkv_ctx->sub_tracks[i]->lang)) != NULL)
1340+
// Try to match against IETF tag first if available
1341+
if (mkv_ctx->sub_tracks[i]->lang_ietf &&
1342+
(match = strstr(lang, mkv_ctx->sub_tracks[i]->lang_ietf)) != NULL)
1343+
save_sub_track(mkv_ctx, mkv_ctx->sub_tracks[i]);
1344+
// Fall back to 3-letter code
1345+
else if ((match = strstr(lang, mkv_ctx->sub_tracks[i]->lang)) != NULL)
12851346
save_sub_track(mkv_ctx, mkv_ctx->sub_tracks[i]);
12861347
}
12871348
else
@@ -1337,9 +1398,13 @@ void matroska_parse(struct matroska_ctx *mkv_ctx)
13371398
default:
13381399
if (code_len == MATROSKA_MAX_ID_LENGTH)
13391400
{
1340-
mprint(MATROSKA_ERROR "Unknown element 0x%x at position " LLD ", skipping file parsing\n", code,
1401+
mprint(MATROSKA_WARNING "Unknown element 0x%x at position " LLD ", skipping this element\n", code,
13411402
get_current_byte(file) - MATROSKA_MAX_ID_LENGTH);
1342-
return;
1403+
// Skip just the unknown element, not the entire block
1404+
read_vint_block_skip(file);
1405+
// Reset code and code_len to start fresh with next element
1406+
code = 0;
1407+
code_len = 0;
13431408
}
13441409
break;
13451410
}

src/lib_ccx/matroska.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@
120120
/* Misc ids */
121121
#define MATROSKA_VOID 0xEC
122122
#define MATROSKA_CRC32 0xBF
123+
#define MATROSKA_SEGMENT_TRACK_LANGUAGE_IETF 0x22B59D
123124

124125
/* DEFENCE FROM THE FOOL - deprecated IDs */
125126
#define MATROSKA_SEGMENT_TRACK_TRACK_TIMECODE_SCALE 0x23314F
@@ -214,6 +215,7 @@ struct matroska_avc_frame {
214215
struct matroska_sub_track {
215216
char* header; // Style header for ASS/SSA (and other) subtitles
216217
char* lang;
218+
char *lang_ietf; //IETF language tag (BCP47)
217219
ULLONG track_number;
218220
ULLONG lang_index;
219221
enum matroska_track_subtitle_codec_id codec_id;

0 commit comments

Comments
 (0)