]> git.gir.st - ttxd.git/blob - html.pl
update to dvb-t2 (new hardware, software)
[ttxd.git] / html.pl
1 #!/usr/bin/perl -X
2
3 # (C) 2016-2017 Tobias Girstmair
4 # Extracts hypertext formatted news from ORF Teletext
5 # uses a modified version of vtx2ascii to decode pages
6 # from dvbtext's spool directory.
7
8 # Usage: ./plain.pl <VTX-file>
9 # Output: HTML
10
11 use strict;
12 use warnings;
13 use 5.010;
14 # convert to utf-8 before outputting; WARN: still processes latin-1 (regexes too)!
15 binmode STDOUT, ":encoding(utf8)";
16
17 my %meta;
18 my $title;
19 my $text = "";
20 my $page = shift;
21 my ($subpage) = $page=~m/\d{3}_(\d{2})/; #requires ppp_ss file name scheme! @parens: https://stackoverflow.com/a/10034105
22 $subpage += 0; # convert to number to remove leading zero
23 # run through vtx2ascii (has been modified to output correct ISO 8859-1 without national replacements)
24 open (VTX, "./vtx2ascii -a $page |") || die ("Can'r run vtx2ascii");
25 my $last = "";
26 my $is_10x = 0;
27 do {
28 # transliterate from ETSI EN 300 706 G0 German to latin-1 (AOUaouBoS):
29 tr/[\\]{|}~`@/\N{U+C4}\N{U+D6}\N{U+DC}\N{U+E4}\N{U+F6}\N{U+FC}\N{U+DF}\N{U+B0}\N{U+A7}/;
30 my $line = $_;
31 $line =~ s/^\s+|\s+$//g;
32 chomp ($line);
33
34 given ($.) {
35 when (1) { %meta = parse_metadata ($line) ; $is_10x = ($meta{'page'}<110) }
36 when (2) { $meta{'subres'} = $line }
37 when (3) { $meta{'res'} = $line }
38 when (4) {}
39 when (5 + (1*$is_10x)) { $title = $line }
40 when (4 + (1*$is_10x)) {}
41 when (4 + (3*$is_10x)) { $title .=$line if ($title eq "")}
42 default { $text .= $last . "_EOL_" . ($last eq ""?"":($line eq ""?"<br>":" ")) }
43 }
44 $last = $line unless $. == (5+(2*$is_10x));
45 } while (<VTX>);
46 $text .= $last;
47
48 # substitute hyphenation:
49 # * replace with soft hyphen when it splits a word (between lowercase letters;
50 # still allows line break when necessary.
51 # * keep, when followed by uppercase letter (e.g. "PIN-Nummer")
52 # * keep, when after a digit (e.g "30-jaehriges")
53 # * remove in any other case (was: forced hyphenation due to space constraints)
54 # ad _EOL_: linebreaks already stripped in loop above; wouldn't work either way
55 # due to single line regex. INFO: Underscore is in DE-localized teletext charset.
56 # ad s/und/UND/: otherwise, "foo- und barbaz" will become "foound barbaz"
57 $text =~ s/\bund\b/_UND_/g;
58 $text =~ s/([[:lower:]])-_EOL_ ([[:lower:]])/\1&shy;\2/g;
59 $text =~ s/([[:alnum:]])-_EOL_ ([[:upper:]])/\1-\2/g;
60 $text =~ s/([[:digit:]])-_EOL_ ([[:alnum:]])/\1-\2/g;
61 $text =~ s/_UND_/und/g;
62 $text =~ s/_EOL_//g;
63
64 # remove ORFText idiosyncrasies
65 $text =~ s/([[:alnum:]]),([[:alpha:]])/\1, \2/g; # no space after comma to save space...
66 $text =~ s/([[:alpha:]]),([[:alnum:]])/\1, \2/g; # ...but not between numbers
67 $text =~ s/([[:alnum:]])\.([[:upper:]])/\1. \2/g; # no space after period to save space... (WARN: breaks URLS like tirol.ORF.at)
68 $text =~ s/([[:alpha:]])\.([[:upper:][:digit:]])/\1. \2/g; # ...but not between numbers
69 $text =~ s/([[:alnum:]]):([[:alnum:]])/\1: \2/g; # no space after colon to save space... TODO: doesn't work
70
71 # adblocker: (keep it 7bit-ASCII; perl processes latin1, but this script is utf-8 encoded (output will be utf8 due to `binmode` at beginning))
72 $text =~ s/Kalendarium - t.glich neu \. 734//g;
73 $text =~ s/>>tirol\. ?ORF\.at//g;
74
75 my @tmp = split(' ',$meta{'date'});
76 my $shortdate = substr $tmp[1], 0, 6;
77 my $pagesubpage = $meta{'page'} . ($subpage > 0?".$subpage":"");
78 my $moreinfo = "$meta{'res'} - $meta{'subres'}; $meta{'date'}";
79 print "<p>$pagesubpage:</span> <b title='$moreinfo'>$title</b><br>$text</p>";
80
81 close (VTX);
82
83 sub parse_metadata {
84 my @elems = split ' ', @_[0];
85
86 my %retval = (
87 'page' => shift @elems,
88 'channel' => shift @elems,
89 'date' => join (' ', @elems)
90 );
91
92 return %retval;
93 }
Imprint / Impressum