]>
git.gir.st - ttxd.git/blob - plain.pl
3 # (C) 2016-2017 Tobias Girstmair
4 # Extracts plain text news from ORF Teletext
5 # uses a modified version of vtx2ascii to decode pages
6 # from dvbtext's spool directory.
8 # Usage: ./plain.pl <VTX-file>
9 # Output: Line 1: Heading (pageNo); following lines are news body
14 binmode STDOUT
, ":encoding(utf8)";
39 # TODO: run tzap/dvbtext in background if not already running
41 my $subp = 0; #shift; #TODO: could be undefined
42 # run through vtx2ascii (has been modified to output correct ISO 8859-1 without national replacements)
43 open (VTX
, "./vtx2ascii -a $page |") || die ("Can'r run vtx2ascii");
47 # transliterate from ETSI EN 300 706 G0 German to UTF-8:
48 tr/[\\]{|}~/\N{U+C4}\N{U+D6}\N{U+DC}\N{U+E4}\N{U+F6}\N{U+FC}\N{U+DF}/;
50 $line =~ s/^\s+|\s+$//g;
54 when (1) { %meta = parse_metadata
($line) ; $is_10x = ($meta{'page'}<110) }
55 when (2) { $meta{'subres'} = $line }
56 when (3) { $meta{'res'} = $line }
58 when (5 + (1*$is_10x)) { $title = $line }
59 when (4 + (1*$is_10x)) {}
60 when (4 + (3*$is_10x)) { $title .=$line }
61 default { $text .= $last ."|EOL|". ($last eq ""?
"":($line eq ""?
"\n":" ")) }
63 $last = $line unless $. == (5+(2*$is_10x));
67 $text =~ s/([[:lower:]])-\|EOL\| ([[:lower:]])/\1\2/g; #remove hyphenation only when in between lowercase letters
68 $text =~ s/\|EOL\|//g; #remove hyphenation only when in between lowercase letters
71 $text =~ s/ORF TELETEXT jetzt auch als App gratis im App-Store für iOS . Android//g;
72 $text =~ s/Weidenrinde bei R.ckenschmerzen >652 Onlineshop: www\.hafesan\.at//g;
75 print STDERR
"Page: $meta{'page'}\tChannel: $meta{'channel'}\tDate: $meta{'date'}\n";
76 print STDERR
"Ressort: $meta{'res'}\tSubressort: $meta{'subres'}\n";
77 print STDERR
"is_10x: ", $is_10x?"yes":"no", "\n";
79 print $title, " ($meta{'page'})\n", $text;
84 my @elems = split ' ', @_[0];
87 'page' => shift @elems,
88 'channel' => shift @elems,
89 'date' => join (' ', @elems) ##date doesnt work TODO