]>
git.gir.st - ttxd.git/blob - html.pl
3 # (C) 2016-2017 Tobias Girstmair
4 # Extracts hypertext formatted news from ORF Teletext
5 # uses a modified version of vtx2ascii to decode pages
6 # from dvbtext's spool directory.
8 # Usage: ./plain.pl <VTX-file>
14 # convert to utf-8 before outputting; WARN: still processes latin-1 (regexes too)!
15 binmode STDOUT
, ":encoding(utf8)";
20 # TODO: run tzap/dvbtext in background if not already running
22 my ($subpage) = $page=~m/\d{3}_
(\d
{2})/; #requires ppp_ss file name scheme! @parens: https://stackoverflow.com/a/10034105
23 $subpage += 0; # convert to number to remove leading zero
24 # run through vtx2ascii (has been modified to output correct ISO 8859-1 without national replacements)
25 open (VTX
, "./vtx2ascii -a $page |") || die ("Can'r run vtx2ascii");
29 # transliterate from ETSI EN 300 706 G0 German to latin-1 (ÄÖÜäöüß°§):
30 tr/[\\]{|}~`@/\N{U+C4}\N{U+D6}\N{U+DC}\N{U+E4}\N{U+F6}\N{U+FC}\N{U+DF}\N{U+B0}\N{U+A7}/;
32 $line =~ s/^\s+|\s+$//g;
36 when (1) { %meta = parse_metadata
($line) ; $is_10x = ($meta{'page'}<110) }
37 when (2) { $meta{'subres'} = $line }
38 when (3) { $meta{'res'} = $line }
40 when (5 + (1*$is_10x)) { $title = $line }
41 when (4 + (1*$is_10x)) {}
42 when (4 + (3*$is_10x)) { $title .=$line if ($title eq "")}
43 default { $text .= $last . "_EOL_" . ($last eq ""?
"":($line eq ""?
"<br>":" ")) }
45 $last = $line unless $. == (5+(2*$is_10x));
49 # substitute hyphenation:
50 # * replace with soft hyphen when it splits a word (between lowercase letters;
51 # still allows line break when necessary.
52 # * keep, when followed by uppercase letter (e.g. "03-Jährige", "PIN-Nummer")
53 # * remove in any other case (was: forced hyphenation due to space constraints)
54 # ad _EOL_: linebreaks already stripped in loop above; wouldn't work either way
55 # due to single line regex. INFO: Underscore is in DE-localized teletext charset.
56 $text =~ s/([[:lower:]])-_EOL_ ([[:lower:]])/\1­\2/g;
57 $text =~ s/([[:alnum:]])-_EOL_ ([[:upper:]])/\1-\2/g;
60 # remove ORFText idiosyncrasies
61 $text =~ s/([[:alnum:]]),([[:alnum:]])/\1, \2/g; # no space after comma to save space
62 $text =~ s/([[:alnum:]])\.([[:upper:][:digit:]])/\1. \2/g; # no space after period to save space (WARN: breaks URLS like tirol.ORF.at)
64 # adblocker: (keep it 7bit-ASCII; perl processes latin1, but output will be utf8)
65 $text =~ s/Kalendarium - t.glich neu \. 734//g;
66 $text =~ s/>>tirol\. ?ORF\.at//g;
68 my @tmp = split(' ',$meta{'date'});
69 my $shortdate = substr $tmp[1], 0, 6;
70 my $pagesubpage = $meta{'page'} . ($subpage > 0?
".$subpage":"");
71 my $moreinfo = "$meta{'res'} - $meta{'subres'}; $meta{'date'}";
72 print "<p>$pagesubpage:</span> <b title='$moreinfo' onclick='javascript:alert(\"$moreinfo\")'>$title</b><br>$text</p>";
77 my @elems = split ' ', @_[0];
80 'page
' => shift @elems,
81 'channel
' => shift @elems,
82 'date
' => join (' ', @elems)