]> git.gir.st - ttxd.git/blob - html.pl
noted that dvbtext and vtx2ascii had been modified
[ttxd.git] / html.pl
1 #!/usr/bin/perl -X
2
3 # (C) 2016-2017 Tobias Girstmair
4 # Extracts hypertext formatted news from ORF Teletext
5 # uses a modified version of vtx2ascii to decode pages
6 # from dvbtext's spool directory.
7
8 # Usage: ./html.pl <VTX-file>
9 # Output: HTML
10
11 use strict;
12 use warnings;
13 use 5.010;
14 binmode STDOUT, ":encoding(utf8)";
15
16 # Seitenformat:
17 # 100-109:
18 # Metadaten: 1
19 # Subressort: 2
20 # Ressort/Sparte: 3
21 # Leer 4
22 # Related: 5
23 # Leer 6
24 # Titel: 7
25 # Text: 8-24
26 #
27 # 112-899:
28 # Metadaten: 1
29 # Subressort: 2
30 # Ressort/Sparte: 3
31 # Leer 4
32 # Titel: 5
33 # Text: 6-24
34 #
35
# ---------------------------------------------------------------------------
# Main script: decode one VTX page via ./vtx2ascii and print it as an HTML
# paragraph of the form
#     <p>PAGE: <b>TITLE</b><br>TEXT</p>
#
# Row layout as handled below (rows are 1-based lines of vtx2ascii output):
#   rows 1-3 : metadata, subressort, ressort
#   row  4   : skipped
#   pages >= 110: row 5 is the title, text starts at row 6
#   pages <  110: row 5 is skipped, row 6 starts the title, row 7 is
#                 appended to it, text starts at row 8
# ---------------------------------------------------------------------------
my %meta;
my $title;
my $text = "";
my $page = shift;
my $subp = 0; #TODO: allow subpages

die "Usage: $0 <VTX-file>\n" unless defined $page;

# run through vtx2ascii (has been modified to output correct ISO 8859-1 without national replacements)
# List-form pipe open: the file name is passed as a single argument and never
# seen by a shell, so metacharacters in it cannot inject commands.
open(my $vtx, '-|', './vtx2ascii', '-a', $page)
    or die("Can't run vtx2ascii: $!");

my $last   = "";   # previous stored row; text output lags one row behind
my $is_10x = 0;    # true for pages below 110 (different row layout)

while (my $raw = <$vtx>) {
    my $row = $.;

    # transliterate from ETSI EN 300 706 G0 German to Latin-1 (will be converted to UTF-8 by perl):
    $raw =~ tr/[\\]{|}~/\N{U+C4}\N{U+D6}\N{U+DC}\N{U+E4}\N{U+F6}\N{U+FC}\N{U+DF}/;

    # Trim both ends; this also removes the trailing newline.
    (my $line = $raw) =~ s/^\s+|\s+$//g;

    # The decoded text ends up inside HTML, so neutralise markup characters.
    $line =~ s/&/&amp;/g;
    $line =~ s/</&lt;/g;
    $line =~ s/>/&gt;/g;

    if ($row == 1) {
        %meta   = parse_metadata($line);
        $is_10x = (($meta{'page'} // 0) < 110) ? 1 : 0;
    }
    elsif ($row == 2) {
        $meta{'subres'} = $line;
    }
    elsif ($row == 3) {
        $meta{'res'} = $line;
    }
    elsif ($row == 4 || ($is_10x && $row == 5)) {
        # skipped rows (row 5 is only skipped on pages below 110)
    }
    elsif ($row == ($is_10x ? 6 : 5)) {
        $title = $line;
    }
    elsif ($is_10x && $row == 7) {
        $title .= $line;
    }
    else {
        # Emit the previous row followed by the _EOL_ marker, then a
        # separator chosen by looking at the current row: nothing after a
        # blank row, <br> before a blank row, a space otherwise.
        my $sep = $last eq "" ? ""
                : $line eq "" ? "<br>"
                :               " ";
        $text .= $last . "_EOL_" . $sep;
    }

    # The final title row is not remembered, so the first text row is
    # preceded by whatever row came before the title (row 4 or row 6).
    $last = $line unless $row == ($is_10x ? 7 : 5);
}
$text .= $last;

#remove hyphenation at original line ending only when in between lowercase letters, replace with soft hyphen to still allow hyphenation when needed. ad _EOL_: linebreaks already stripped in loop above; wouldn't work either way due to single line regex.
$text =~ s/([[:lower:]])-_EOL_ ([[:lower:]])/$1&shy;$2/g;
$text =~ s/_EOL_//g;

# adblocker: just add more regexes
$text =~ s/Kalendarium - t.glich neu \. 734//g;

# Fall back to empty strings when the decoder produced too few rows.
print "<p>" . ($meta{'page'} // '') . ": <b>" . ($title // '') . "</b><br>$text</p>";

# The exit status of vtx2ascii is deliberately not checked, as before.
close($vtx);
76
# parse_metadata($header_line)
#
# Splits a page's first (metadata) line on whitespace into its fields.
#
# Returns a hash as a flat key/value list with exactly three keys:
#   page    - first token (the caller compares it numerically against 110)
#   channel - second token
#   date    - all remaining tokens, joined by single spaces
# Missing tokens yield undef for page/channel and '' for date.
sub parse_metadata {
    my ($header_line) = @_;

    # split ' ' ignores leading whitespace and collapses runs of whitespace;
    # an undefined argument is treated like an empty line.
    my ($page, $channel, @date_parts) = split ' ', ($header_line // '');

    return (
        'page'    => $page,
        'channel' => $channel,
        'date'    => join(' ', @date_parts),
    );
}
Imprint / Impressum