]> git.gir.st - ttxd.git/blob - html.pl
remove some edge cases in html.pl
[ttxd.git] / html.pl
1 #!/usr/bin/perl -X
2
3 # (C) 2016-2017 Tobias Girstmair
4 # Extracts hypertext formatted news from ORF Teletext
5 # uses a modified version of vtx2ascii to decode pages
6 # from dvbtext's spool directory.
7
8 # Usage: ./plain.pl <VTX-file>
9 # Output: HTML
10
11 use strict;
12 use warnings;
13 use 5.010;
14 # convert to utf-8 before outputting; WARN: still processes latin-1 (regexes too)!
15 binmode STDOUT, ":encoding(utf8)";
16
17 my %meta;
18 my $title;
19 my $text = "";
20 # TODO: run tzap/dvbtext in background if not already running
21 my $page = shift;
22 my ($subpage) = $page=~m/\d{3}_(\d{2})/; #requires ppp_ss file name scheme! @parens: https://stackoverflow.com/a/10034105
23 $subpage += 0; # convert to number to remove leading zero
24 # run through vtx2ascii (has been modified to output correct ISO 8859-1 without national replacements)
25 open (VTX, "./vtx2ascii -a $page |") || die ("Can'r run vtx2ascii");
26 my $last = "";
27 my $is_10x = 0;
28 do {
29 # transliterate from ETSI EN 300 706 G0 German to latin-1 (ÄÖÜäöüß°§):
30 tr/[\\]{|}~`@/\N{U+C4}\N{U+D6}\N{U+DC}\N{U+E4}\N{U+F6}\N{U+FC}\N{U+DF}\N{U+B0}\N{U+A7}/;
31 my $line = $_;
32 $line =~ s/^\s+|\s+$//g;
33 chomp ($line);
34
35 given ($.) {
36 when (1) { %meta = parse_metadata ($line) ; $is_10x = ($meta{'page'}<110) }
37 when (2) { $meta{'subres'} = $line }
38 when (3) { $meta{'res'} = $line }
39 when (4) {}
40 when (5 + (1*$is_10x)) { $title = $line }
41 when (4 + (1*$is_10x)) {}
42 when (4 + (3*$is_10x)) { $title .=$line if ($title eq "")}
43 default { $text .= $last . "_EOL_" . ($last eq ""?"":($line eq ""?"<br>":" ")) }
44 }
45 $last = $line unless $. == (5+(2*$is_10x));
46 } while (<VTX>);
47 $text .= $last;
48
49 # substitute hyphenation:
50 # * replace with soft hyphen when it splits a word (between lowercase letters;
51 # still allows line break when necessary.
52 # * keep, when followed by uppercase letter (e.g. "03-Jährige", "PIN-Nummer")
53 # * remove in any other case (was: forced hyphenation due to space constraints)
54 # ad _EOL_: linebreaks already stripped in loop above; wouldn't work either way
55 # due to single line regex. INFO: Underscore is in DE-localized teletext charset.
56 $text =~ s/([[:lower:]])-_EOL_ ([[:lower:]])/\1&shy;\2/g;
57 $text =~ s/([[:alnum:]])-_EOL_ ([[:upper:]])/\1-\2/g;
58 $text =~ s/_EOL_//g;
59
60 # remove ORFText idiosyncrasies
61 $text =~ s/([[:alnum:]]),([[:alnum:]])/\1, \2/g; # no space after comma to save space
62 $text =~ s/([[:alnum:]])\.([[:upper:][:digit:]])/\1. \2/g; # no space after period to save space (WARN: breaks URLS like tirol.ORF.at)
63
64 # adblocker: (keep it 7bit-ASCII; perl processes latin1, but output will be utf8)
65 $text =~ s/Kalendarium - t.glich neu \. 734//g;
66 $text =~ s/>>tirol\. ?ORF\.at//g;
67
68 my @tmp = split(' ',$meta{'date'});
69 my $shortdate = substr $tmp[1], 0, 6;
70 my $pagesubpage = $meta{'page'} . ($subpage > 0?".$subpage":"");
71 my $moreinfo = "$meta{'res'} - $meta{'subres'}; $meta{'date'}";
72 print "<p>$pagesubpage:</span> <b title='$moreinfo' onclick='javascript:alert(\"$moreinfo\")'>$title</b><br>$text</p>";
73
74 close (VTX);
75
76 sub parse_metadata {
77 my @elems = split ' ', @_[0];
78
79 my %retval = (
80 'page' => shift @elems,
81 'channel' => shift @elems,
82 'date' => join (' ', @elems)
83 );
84
85 return %retval;
86 }
Imprint / Impressum