-
Notifications
You must be signed in to change notification settings - Fork 46
/
Copy pathcheck-space-after-paragraph.pl
executable file
·73 lines (71 loc) · 2.1 KB
/
check-space-after-paragraph.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env perl
# Checks that SpaceAfter=No does not occur on the last token before a new paragraph or document.
# Note that such errors cause malfunction of conllu_to_text.pl, which generates the new paragraph and ignores SpaceAfter=No.
# Copyright © 2020 Dan Zeman <[email protected]>
# License: GNU GPL
use strict;
use warnings;
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');

# Reads CoNLL-U from the files given as arguments (or from STDIN) and reports
# to STDERR every "# newpar" / "# newdoc" comment that is directly preceded by
# a surface token whose MISC column contains SpaceAfter=No.
# The state that must survive from one line to the next lives in %state:
#   ignore_until ....... last word ID covered by the current multi-word token
#   spaceafternoline ... line number of the pending SpaceAfter=No (if any)
#   spaceaftersentid ... sent_id of the sentence containing that token
#   sentid ............. sent_id of the sentence currently being read
my %state;
my $iline = 0;
while(my $line = <>)
{
    $iline++;
    my $message = check_line(\%state, $line, $iline);
    print STDERR ($message) if(defined($message));
}

#------------------------------------------------------------------------------
# Processes one input line and updates the state accordingly.
#   $state ... reference to the hash with the cross-line state (see above)
#   $line .... the raw input line, with or without the line terminator
#   $iline ... 1-based number of the line, used in the error message
# Returns the error message (terminated by a newline) if the line opens a new
# paragraph or document right after a token with SpaceAfter=No; returns
# nothing (undef in scalar context) otherwise.
#------------------------------------------------------------------------------
sub check_line
{
    my ($state, $line, $iline) = @_;
    # Remove the line terminator. Plain chomp() would leave the CR of a CRLF
    # file glued to the last column and "SpaceAfter=No\r" would never match.
    $line =~ s/[\r\n]+\z//;
    if($line =~ m/^\d/)
    {
        my @f = split(/\t/, $line);
        my $id = $f[0];
        # Empty nodes (decimal IDs such as 5.1) are not part of the surface
        # text. Their MISC must neither set nor clear a pending SpaceAfter=No.
        return if($id =~ m/^\d+\.\d+$/);
        # Multi-word tokens need a special treatment: the spacing information
        # is on the token line and the member words must be ignored.
        if($id =~ m/^\d+-(\d+)$/)
        {
            $state->{ignore_until} = $1;
        }
        elsif($id =~ m/^\d+$/ && defined($state->{ignore_until}) && $id > $state->{ignore_until})
        {
            # We are past the last word of the multi-word token.
            $state->{ignore_until} = undef;
        }
        if($id =~ m/^\d+-\d+$/ || !defined($state->{ignore_until}))
        {
            # Tolerate malformed lines with fewer than ten columns.
            my $misc = defined($f[9]) ? $f[9] : '';
            if(grep {$_ eq 'SpaceAfter=No'} (split(/\|/, $misc)))
            {
                $state->{spaceafternoline} = $iline;
                $state->{spaceaftersentid} = $state->{sentid};
            }
            else
            {
                $state->{spaceafternoline} = undef;
                $state->{spaceaftersentid} = undef;
            }
        }
    }
    elsif($line =~ m/^\s*$/)
    {
        # Reset ignore_until at the end of the sentence if we did not reset it earlier.
        $state->{ignore_until} = undef;
    }
    elsif($line =~ m/^\#\s*new(doc|par)(\s|$)/)
    {
        # It is possible that there is no space between two sentences.
        # But it is not possible between two paragraphs or documents.
        if(defined($state->{spaceafternoline}))
        {
            my $where = $state->{spaceafternoline};
            # The sentence may have had no sent_id; keep the message format and print nothing for it.
            my $sentid = defined($state->{spaceaftersentid}) ? $state->{spaceaftersentid} : '';
            # Report each offending token only once.
            $state->{spaceafternoline} = undef;
            $state->{spaceaftersentid} = undef;
            return "Line $iline: new paragraph or document was preceded by SpaceAfter=No on line $where (sentence $sentid).\n";
        }
    }
    elsif($line =~ m/^\#\s*sent_id\s*=\s*(\S+)/)
    {
        $state->{sentid} = $1;
    }
    return;
}