]> git.gir.st - ttxd.git/blob - plain.pl
noted that dvbtext and vtx2ascii had been modified
[ttxd.git] / plain.pl
1 #!/usr/bin/perl -X
2
3 # (C) 2016-2017 Tobias Girstmair
4 # Extracts plain text news from ORF Teletext
5 # uses a modified version of vtx2ascii to decode pages
6 # from dvbtext's spool directory.
7
8 # Usage: ./plain.pl <VTX-file>
9 # Output: Line 1: Heading (pageNo); following lines are news body
10
11 use strict;
12 use warnings;
13 use 5.010;
14 binmode STDOUT, ":encoding(utf8)";
15
16 # Seitenformat:
17 # 100-109:
18 # Metadaten: 1
19 # Subressort: 2
20 # Ressort/Sparte: 3
21 # Leer 4
22 # Related: 5
23 # Leer 6
24 # Titel: 7
25 # Text: 8-24
26 #
27 # 112-899:
28 # Metadaten: 1
29 # Subressort: 2
30 # Ressort/Sparte: 3
31 # Leer 4
32 # Titel: 5
33 # Text: 6-24
34 #
35
36 my %meta;
37 my $title;
38 my $text = "";
39 # TODO: run tzap/dvbtext in background if not already running
40 my $page = shift;
41 my $subp = 0; #shift; #TODO: could be undefined
42 # run through vtx2ascii (has been modified to output correct ISO 8859-1 without national replacements)
43 open (VTX, "./vtx2ascii -a $page |") || die ("Can'r run vtx2ascii");
44 my $last = "";
45 my $is_10x = 0;
46 do {
47 # transliterate from ETSI EN 300 706 G0 German to UTF-8:
48 tr/[\\]{|}~/\N{U+C4}\N{U+D6}\N{U+DC}\N{U+E4}\N{U+F6}\N{U+FC}\N{U+DF}/;
49 my $line = $_;
50 $line =~ s/^\s+|\s+$//g;
51 chomp ($line);
52
53 given ($.) {
54 when (1) { %meta = parse_metadata ($line) ; $is_10x = ($meta{'page'}<110) }
55 when (2) { $meta{'subres'} = $line }
56 when (3) { $meta{'res'} = $line }
57 when (4) {}
58 when (5 + (1*$is_10x)) { $title = $line }
59 when (4 + (1*$is_10x)) {}
60 when (4 + (3*$is_10x)) { $title .=$line }
61 default { $text .= $last ."|EOL|". ($last eq ""?"":($line eq ""?"\n":" ")) }
62 }
63 $last = $line unless $. == (5+(2*$is_10x));
64 } while (<VTX>);
65 $text .= $last;
66
67 $text =~ s/([[:lower:]])-\|EOL\| ([[:lower:]])/\1\2/g; #remove hyphenation only when in between lowercase letters
68 $text =~ s/\|EOL\|//g; #remove hyphenation only when in between lowercase letters
69
70 #ADBlocker
71 $text =~ s/ORF TELETEXT jetzt auch als App gratis im App-Store für iOS . Android//g;
72 $text =~ s/Weidenrinde bei R.ckenschmerzen >652 Onlineshop: www\.hafesan\.at//g;
73
74 #DEBUG:
75 print STDERR "Page: $meta{'page'}\tChannel: $meta{'channel'}\tDate: $meta{'date'}\n";
76 print STDERR "Ressort: $meta{'res'}\tSubressort: $meta{'subres'}\n";
77 print STDERR "is_10x: ", $is_10x?"yes":"no", "\n";
78
79 print $title, " ($meta{'page'})\n", $text;
80
81 close (VTX);
82
83 sub parse_metadata {
84 my @elems = split ' ', @_[0];
85
86 my %retval = (
87 'page' => shift @elems,
88 'channel' => shift @elems,
89 'date' => join (' ', @elems) ##date doesnt work TODO
90 );
91
92 return %retval;
93 }
Imprint / Impressum