#!/usr/bin/env perl | |
# Extracts raw text from CoNLL-U file. Uses newdoc and newpar tags when available. | |
# Copyright © 2017 Dan Zeman <[email protected]> | |
# License: GNU GPL | |
use utf8; | |
use open ':utf8'; | |
binmode(STDIN, ':utf8'); | |
binmode(STDOUT, ':utf8'); | |
binmode(STDERR, ':utf8'); | |
use Getopt::Long; | |
# Language code 'zh' or 'ja' will trigger Chinese-like text formatting. | |
my $language = 'en'; | |
GetOptions | |
( | |
'language=s' => \$language | |
); | |
my $chinese = $language =~ m/^(zh|ja)(_|$)/; | |
my @sentence = (); | |
my $text = ''; # from the text attribute of the sentence | |
my $ftext = ''; # from the word forms of the tokens | |
my $newpar = 0; | |
my $newdoc = 0; | |
my $buffer = ''; | |
my $start = 1; | |
my $mwtlast; | |
while(<>) | |
{ | |
push(@sentence, $_); | |
if(m/^\#\s*text\s*=\s*(.+)/) | |
{ | |
$text = $1; | |
} | |
elsif(m/^\#\s*newpar(\s|$)/i) | |
{ | |
$newpar = 1; | |
} | |
elsif(m/^\#\s*newdoc(\s|$)/i) | |
{ | |
$newdoc = 1; | |
} | |
elsif(m/^\d+-(\d+)\t/) | |
{ | |
$mwtlast = $1; | |
my @f = split(/\t/, $_); | |
# Paragraphs may start in the middle of a sentence (bulleted lists, verse etc.) | |
# The first token of the new paragraph has "NewPar=Yes" in the MISC column. | |
# Multi-word tokens have this in the token-introducing line. | |
if($f[9] =~ m/NewPar=Yes/i) | |
{ | |
# Empty line between documents and paragraphs. (There may have been | |
# a paragraph break before the first part of this sentence as well!) | |
$buffer = print_new_paragraph_if_needed($start, $newdoc, $newpar, $buffer); | |
$buffer .= $ftext; | |
# Line breaks at word boundaries after at most 80 characters. | |
$buffer = print_lines_from_buffer($buffer, 80, $chinese); | |
print("$buffer\n\n"); | |
$buffer = ''; | |
# Start is only true until we write the first sentence of the input stream. | |
$start = 0; | |
$newdoc = 0; | |
$newpar = 0; | |
$text = ''; | |
$ftext = ''; | |
} | |
$ftext .= $f[1]; | |
$ftext .= ' ' unless($f[9] =~ m/SpaceAfter=No/); | |
} | |
elsif(m/^(\d+)\t/ && !(defined($mwtlast) && $1<=$mwtlast)) | |
{ | |
$mwtlast = undef; | |
my @f = split(/\t/, $_); | |
# Paragraphs may start in the middle of a sentence (bulleted lists, verse etc.) | |
# The first token of the new paragraph has "NewPar=Yes" in the MISC column. | |
# Multi-word tokens have this in the token-introducing line. | |
if($f[9] =~ m/NewPar=Yes/i) | |
{ | |
# Empty line between documents and paragraphs. (There may have been | |
# a paragraph break before the first part of this sentence as well!) | |
$buffer = print_new_paragraph_if_needed($start, $newdoc, $newpar, $buffer); | |
$buffer .= $ftext; | |
# Line breaks at word boundaries after at most 80 characters. | |
$buffer = print_lines_from_buffer($buffer, 80, $chinese); | |
print("$buffer\n\n"); | |
$buffer = ''; | |
# Start is only true until we write the first sentence of the input stream. | |
$start = 0; | |
$newdoc = 0; | |
$newpar = 0; | |
$text = ''; | |
$ftext = ''; | |
} | |
$ftext .= $f[1]; | |
$ftext .= ' ' unless($f[9] =~ m/SpaceAfter=No/); | |
} | |
elsif(m/^\s*$/) | |
{ | |
# In a valid CoNLL-U file, $text should be equal to $ftext except for the | |
# space after the last token. However, if there have been intra-sentential | |
# paragraph breaks, $ftext contains only the part after the last such | |
# break, and $text is empty. Hence we currently use $ftext everywhere | |
# and ignore $text, even though we note it when seeing the text attribute. | |
# $text .= ' ' unless($chinese); | |
# Empty line between documents and paragraphs. | |
$buffer = print_new_paragraph_if_needed($start, $newdoc, $newpar, $buffer); | |
$buffer .= $ftext; | |
# Line breaks at word boundaries after at most 80 characters. | |
$buffer = print_lines_from_buffer($buffer, 80, $chinese); | |
# Start is only true until we write the first sentence of the input stream. | |
$start = 0; | |
$newdoc = 0; | |
$newpar = 0; | |
$text = ''; | |
$ftext = ''; | |
$mwtlast = undef; | |
} | |
} | |
# There may be unflushed buffer contents after the last sentence, less than 80 characters | |
# (otherwise we would have already dealt with it), so just flush it. | |
if($buffer ne '') | |
{ | |
print("$buffer\n"); | |
} | |
#------------------------------------------------------------------------------ | |
# Checks whether we have to print an extra line to separate paragraphs. Does it | |
# if necessary. Returns the updated buffer. | |
#------------------------------------------------------------------------------ | |
sub print_new_paragraph_if_needed | |
{ | |
my $start = shift; | |
my $newdoc = shift; | |
my $newpar = shift; | |
my $buffer = shift; | |
if(!$start && ($newdoc || $newpar)) | |
{ | |
if($buffer ne '') | |
{ | |
print("$buffer\n"); | |
$buffer = ''; | |
} | |
print("\n"); | |
} | |
return $buffer; | |
} | |
#------------------------------------------------------------------------------ | |
# Prints as many complete lines of text as there are in the buffer. Returns the | |
# remaining contents of the buffer. | |
#------------------------------------------------------------------------------ | |
sub print_lines_from_buffer | |
{ | |
my $buffer = shift; | |
# Maximum number of characters allowed on one line, not counting the line | |
# break character(s), which also replace any number of trailing spaces. | |
# Exception: If there is a word longer than the limit, it will be printed | |
# on one line. | |
# Note that this algorithm is not suitable for Chinese and Japanese. | |
my $limit = shift; | |
# We need a different algorithm for Chinese and Japanese. | |
my $chinese = shift; | |
if($chinese) | |
{ | |
return print_chinese_lines_from_buffer($buffer, $limit); | |
} | |
if(length($buffer) >= $limit) | |
{ | |
my @cbuffer = split(//, $buffer); | |
# There may be more than one new line waiting in the buffer. | |
while(scalar(@cbuffer) >= $limit) | |
{ | |
###!!! We could make it simpler if we ignored multi-space sequences | |
###!!! between words. It sounds OK to ignore them because at the | |
###!!! line break we do not respect original spacing anyway. | |
my $i; | |
my $ilastspace; | |
for($i = 0; $i<=$#cbuffer; $i++) | |
{ | |
if($i>$limit && defined($ilastspace)) | |
{ | |
last; | |
} | |
if($cbuffer[$i] =~ m/\s/) | |
{ | |
$ilastspace = $i; | |
} | |
} | |
if(defined($ilastspace) && $ilastspace>0) | |
{ | |
my @out = @cbuffer[0..($ilastspace-1)]; | |
splice(@cbuffer, 0, $ilastspace+1); | |
print(join('', @out), "\n"); | |
} | |
else | |
{ | |
print(join('', @cbuffer), "\n"); | |
splice(@cbuffer); | |
} | |
} | |
$buffer = join('', @cbuffer); | |
} | |
return $buffer; | |
} | |
#------------------------------------------------------------------------------ | |
# Prints as many complete lines of text as there are in the buffer. Returns the | |
# remaining contents of the buffer. Assumes that there are no spaces between | |
# words and lines can be broken between any two characters, as is the custom in | |
# Chinese and Japanese. | |
#------------------------------------------------------------------------------ | |
sub print_chinese_lines_from_buffer | |
{ | |
my $buffer = shift; | |
# Maximum number of characters allowed on one line, not counting the line | |
# break character(s). | |
my $limit = shift; | |
# We cannot simply print the first $limit characters from the buffer, | |
# followed by a line break. There could be embedded Latin words or | |
# numbers and we do not want to insert a line break in the middle of | |
# a foreign word. | |
my @cbuffer = split(//, $buffer); | |
while(scalar(@cbuffer) >= $limit) | |
{ | |
my $nprint = 0; | |
for(my $i = 0; $i <= $#cbuffer; $i++) | |
{ | |
if($i > $limit && $nprint > 0) | |
{ | |
last; | |
} | |
unless($i < $#cbuffer && $cbuffer[$i] =~ m/[\p{Latin}0-9]/ && $cbuffer[$i+1] =~ m/[\p{Latin}0-9]/) | |
{ | |
$nprint = $i+1; | |
} | |
} | |
my @out = @cbuffer[0..($nprint-1)]; | |
splice(@cbuffer, 0, $nprint); | |
print(join('', @out), "\n"); | |
} | |
$buffer = join('', @cbuffer); | |
return $buffer; | |
} | |