]>
git.gir.st - ttxd.git/blob - html.pl
3 # (C) 2016-2017 Tobias Girstmair
4 # Extracts hypertext formatted news from ORF Teletext
5 # uses a modified version of vtx2ascii to decode pages
6 # from dvbtext's spool directory.
# Usage: ./html.pl <VTX-file>
# Encode everything written to STDOUT as strict, standards-conformant UTF-8
# (":encoding(utf8)" is Perl's lax internal variant).
binmode(STDOUT, ':encoding(UTF-8)');
40 my $subp = 0 ; #TODO: allow subpages
41 # run through vtx2ascii (has been modified to output correct ISO 8859-1 without national replacements)
# List-form pipe open: vtx2ascii is executed directly, without a shell, so
# $page is handed over as one literal argument even if it contains spaces or
# shell metacharacters. The bareword handle VTX is kept because the rest of
# the script reads from it.
open(VTX, '-|', './vtx2ascii', '-a', $page)
    or die("Can't run vtx2ascii: $!");
46 # transliterate from ETSI EN 300 706 G0 German to Latin-1 (will be converted to UTF-8 by perl):
47 tr/[\\]{|}~/\N{U+C4}\N{U+D6}\N{U+DC}\N{U+E4}\N{U+F6}\N{U+FC}\N{U+DF}/ ;
49 $line =~ s/^\s+|\s+$//g ;
53 when ( 1 ) { %meta = parse_metadata
( $line ) ; $is_10x = ( $meta { 'page' }< 110 ) }
54 when ( 2 ) { $meta { 'subres' } = $line }
55 when ( 3 ) { $meta { 'res' } = $line }
57 when ( 5 + ( 1 * $is_10x )) { $title = $line }
58 when ( 4 + ( 1 * $is_10x )) {}
59 when ( 4 + ( 3 * $is_10x )) { $title .= $line }
60 default { $text .= $last . "_EOL_" . ( $last eq "" ?
"" :( $line eq "" ?
"<br>" : " " )) }
62 $last = $line unless $. == ( 5 +( 2 * $is_10x ));
# Undo hyphenation at an original line ending, but only between two lowercase
# letters; a soft hyphen (U+00AD) is left in its place so the word can still be
# broken there when needed. Line ends are matched via the _EOL_ marker because
# the real line breaks were already stripped in the loop above (and a
# single-line regex could not match across them anyway).
# The soft hyphen is written as \N{U+AD} so the result does not depend on how
# the source file itself is encoded; $1/$2 replace the deprecated \1/\2 form.
$text =~ s/([[:lower:]])-_EOL_ ([[:lower:]])/$1\N{U+AD}$2/g;
70 # adblocker: just add more regexes
71 $text =~ s/Kalendarium - t.glich neu \. 734//g ;
73 print "<p> $meta {'page'}: <b> $title </b><br> $text </p>" ;
78 my @elems = split ' ' , @_ [ 0 ];
81 'page' => shift @elems,
82 'channel' => shift @elems,
83 'date' => join ( ' ' , @elems )