]>
git.gir.st - ttxd.git/blob - html.pl
3 # (C) 2016-2017 Tobias Girstmair
4 # Extracts hypertext formatted news from ORF Teletext
5 # uses a modified version of vtx2ascii to decode pages
6 # from dvbtext's spool directory.
8 # Usage: ./plain.pl <VTX-file>
14 # convert to utf-8 before outputting; WARN: still processes latin-1 (regexes too)!
15 binmode STDOUT
, ":encoding(utf8)";
21 my ($subpage) = $page=~m/\d{3}_
(\d
{2})/; #requires ppp_ss file name scheme! @parens: https://stackoverflow.com/a/10034105
22 $subpage += 0; # convert to number to remove leading zero
23 # run through vtx2ascii (has been modified to output correct ISO 8859-1 without national replacements)
24 open (VTX
, "./vtx2ascii -a $page |") || die ("Can'r run vtx2ascii");
28 # transliterate from ETSI EN 300 706 G0 German to latin-1 (AOUaouBoS):
29 tr/[\\]{|}~`@/\N{U+C4}\N{U+D6}\N{U+DC}\N{U+E4}\N{U+F6}\N{U+FC}\N{U+DF}\N{U+B0}\N{U+A7}/;
31 $line =~ s/^\s+|\s+$//g;
35 when (1) { %meta = parse_metadata
($line) ; $is_10x = ($meta{'page'}<110) }
36 when (2) { $meta{'subres'} = $line }
37 when (3) { $meta{'res'} = $line }
39 when (5 + (1*$is_10x)) { $title = $line }
40 when (4 + (1*$is_10x)) {}
41 when (4 + (3*$is_10x)) { $title .=$line if ($title eq "")}
42 default { $text .= $last . "_EOL_" . ($last eq ""?
"":($line eq ""?
"<br>":" ")) }
44 $last = $line unless $. == (5+(2*$is_10x));
48 # substitute hyphenation:
49 # * replace with soft hyphen when it splits a word (between lowercase letters;
50 # still allows line break when necessary.
51 # * keep, when followed by uppercase letter (e.g. "PIN-Nummer")
52 # * keep, when after a digit (e.g "30-jaehriges")
53 # * remove in any other case (was: forced hyphenation due to space constraints)
54 # ad _EOL_: linebreaks already stripped in loop above; wouldn't work either way
55 # due to single line regex. INFO: Underscore is in DE-localized teletext charset.
56 # ad s/und/UND/: otherwise, "foo- und barbaz" will become "foound barbaz"
57 $text =~ s/\bund\b/_UND_/g;
58 $text =~ s/([[:lower:]])-_EOL_ ([[:lower:]])/\1­\2/g;
59 $text =~ s/([[:alnum:]])-_EOL_ ([[:upper:]])/\1-\2/g;
60 $text =~ s/([[:digit:]])-_EOL_ ([[:alnum:]])/\1-\2/g;
61 $text =~ s/_UND_/und/g;
64 # remove ORFText idiosyncrasies
65 $text =~ s/([[:alnum:]]),([[:alpha:]])/\1, \2/g; # no space after comma to save space...
66 $text =~ s/([[:alpha:]]),([[:alnum:]])/\1, \2/g; # ...but not between numbers
67 $text =~ s/([[:alnum:]])\.([[:upper:]])/\1. \2/g; # no space after period to save space... (WARN: breaks URLS like tirol.ORF.at)
68 $text =~ s/([[:alpha:]])\.([[:upper:][:digit:]])/\1. \2/g; # ...but not between numbers
69 $text =~ s/([[:alnum:]]):([[:alnum:]])/\1: \2/g; # no space after colon to save space... TODO: doesn't work
71 # adblocker: (keep it 7bit-ASCII; perl processes latin1, but this script is utf-8 encoded (output will be utf8 due to `binmode` at beginning))
72 $text =~ s/Kalendarium - t.glich neu \. 734//g;
73 $text =~ s/>>tirol\. ?ORF\.at//g;
75 my @tmp = split(' ',$meta{'date'});
76 my $shortdate = substr $tmp[1], 0, 6;
77 my $pagesubpage = $meta{'page'} . ($subpage > 0?
".$subpage":"");
78 my $moreinfo = "$meta{'res'} - $meta{'subres'}; $meta{'date'}";
79 print "<p>$pagesubpage:</span> <b title='$moreinfo'>$title</b><br>$text</p>";
84 my @elems = split ' ', @_[0];
87 'page' => shift @elems,
88 'channel' => shift @elems,
89 'date' => join (' ', @elems)