]> git.gir.st - ttxd.git/blob - ORFText.pm
publish (privately developed) version 2
[ttxd.git] / ORFText.pm
1 package ORFText;
2 our $VERSION = '2.00';
3
4 =pod Project: ORFText.pm
5 - NOW: a module returning data structure with inline-html
6 - LATER: instead of returning html return a syntax tree
7 html tag insertion is limited to {{{text mangling}}}
8 - html_escape # HTML
9 - emphasize # HTML
10 - rehyphenate # UNICODE
11 - linebreak # HTML
12 - $ORF_idio_slash (negative lookahead against </foo>)
13
14 TODO - URL references (">>tirol.ORF.at")
15 (C) 2016-2019 Tobias Girstmair
16 Extracts hypertext formatted news from ORF Teletext
17 reads and decodes pages from dvbtext's spool directory.
18 =cut
19
20 use 5.010;
21 use strict;
22 use warnings;
23 use utf8;
24 use Data::Dumper;
25
26 use base 'Exporter';
27 our @EXPORT = qw(html weather $TABLE_YES $REF_MARKUP);
28 our $TABLE_YES = 0; # enable that to parse tables w/ vtx2ascii
29 our $REF_MARKUP = sub { return "<u>$_[0]<b>$_[1]</b></u>"};
30
31 ### i/o {{{
32 use constant {
33 VTX_HEADER => 12,
34 TTX_HEADER => 8,
35 STATUS_BAR => 32,
36 };
37 sub slurp_lines { my ($file) = @_;
38 open VTX, "<:raw", "$file" or die ("Can'r open $file");
39
40 # read page number:
41 seek VTX, VTX_HEADER+TTX_HEADER, 0;
42 read VTX, my $pagenum, 3;
43 $pagenum &= "\x7f" x 3; # zero out parity bit
44
45 # read page content:
46 seek VTX, VTX_HEADER+TTX_HEADER+STATUS_BAR, 0;
47 read VTX, my $raw_text, 40*23;
48 $raw_text &= ("\x7f" x length $raw_text); # zero out parity bit
49 # NOTE: unpack chomps strings when using the 'A' template.
50 my @lines = map {tr/[\\]{|}~`@/ÄÖÜäöüß°§/r} unpack 'A40'x23, $raw_text;
51
52 close VTX;
53 my ($subpage) = $file=~m/\d{3}_(\d{2})\.vtx/;
54 return ($pagenum, $subpage, @lines);
55 }
56 ### }}}
57 ### Teletext Grammar {{{
58 # control characters defined in ETSI EN 300 706 (2003) 12.2 Table 26
59 my $ORF_TTX_GRAMMAR = qr {
60 (?(DEFINE)
61 (?<cntrl>
62 (?>(?&a_color)) | (?>(?&g_color))
63 | (?>(?&flash)) | (?>(?&steady))
64 | (?>(?&endbox)) | (?>(?&startbox))
65 | (?>(?&n_size)) | (?>(?&d_height))
66 | (?>(?&d_width)) | (?>(?&d_size))
67 | (?>(?&conceal)) | (?>(?&esc))
68 | (?>(?&g_cont)) | (?>(?&g_sep))
69 | (?>(?&bg_black)) | (?>(?&bg_new))
70 | (?>(?&g_hold)) | (?>(?&g_release))
71 | (?>(?&fake))
72 )
73 (?<a_color>
74 (?>(?&a_blk))
75 | (?>(?&a_red))
76 | (?>(?&a_grn))
77 | (?>(?&a_ylw))
78 | (?>(?&a_blu))
79 | (?>(?&a_mgt))
80 | (?>(?&a_cya))
81 | (?>(?&a_wht))
82 )
83 (?<g_color>
84 (?>(?&g_blk))
85 | (?>(?&g_red))
86 | (?>(?&g_grn))
87 | (?>(?&g_ylw))
88 | (?>(?&g_blu))
89 | (?>(?&g_mgt))
90 | (?>(?&g_cya))
91 | (?>(?&g_wht))
92 )
93 (?<a_blk> \x00) (?<g_blk> \x10) (?<blk> (?&a_blk)|(?&g_blk))
94 (?<a_red> \x01) (?<g_red> \x11) (?<red> (?&a_red)|(?&g_red))
95 (?<a_grn> \x02) (?<g_grn> \x12) (?<grn> (?&a_grn)|(?&g_grn))
96 (?<a_ylw> \x03) (?<g_ylw> \x13) (?<ylw> (?&a_ylw)|(?&g_ylw))
97 (?<a_blu> \x04) (?<g_blu> \x14) (?<blu> (?&a_blu)|(?&g_blu))
98 (?<a_mgt> \x05) (?<g_mgt> \x15) (?<mgt> (?&a_mgt)|(?&g_mgt))
99 (?<a_cya> \x06) (?<g_cya> \x16) (?<cya> (?&a_cya)|(?&g_cya))
100 (?<a_wht> \x07) (?<g_wht> \x17) (?<wht> (?&a_wht)|(?&g_wht))
101 (?<flash> \x08) (?<steady> \x09)
102 (?<endbox> \x0a) (?<startbox> \x0b)
103 (?<n_size> \x0c) (?<d_height> \x0d)
104 (?<d_width> \x0e) (?<d_size> \x0f)
105 (?<conceal> \x18)
106 (?<g_cont> \x19) (?<g_sep> \x1a)
107 (?<esc> \x1b) # switch G0 charset
108 (?<bg_black> \x1c) (?<bg_new> \x1d)
109 (?<g_hold> \x1e) (?<g_release> \x1f)
110
111 (?<ws> (?&cntrl)|[ ])
112 (?<_> (?&ws)*)
113 (?<WHT> (?&a_wht)|(?&g_wht)|[ ])
114 (?<S> \A ^(?&_))
115 (?<E> (?&_)$ \Z)
116
117 # graphics characters:
118 (?<g_34> \x2c) # middle row (sixel 2^3 and 2^4)
119
120 # fake ctrlchar to mark up subheadings with emphasize():
121 (?<fake> \x7f)
122 )
123 }xms;
124 # ORF Specific Shortcuts:
125 # a. page parsing:
126 my $ORF_10x_title = qr/(?&S)(?&WHT)(?&bg_new)((?&a_red)(?&_)[^\x00]+?)(?&E)$ORF_TTX_GRAMMAR/;
127 my $ORF_10x_text_1 = qr/(?&S)(?&WHT)(?&bg_new)[^\x01](.+?)(?&E)$ORF_TTX_GRAMMAR/; # text variant 1: non-emph. beginning
128 my $ORF_10x_text_2 = qr/(?&S)(?&WHT)(?&bg_new)((?&a_red).*(?&a_blk).*)(?&E)$ORF_TTX_GRAMMAR/; # text variant 2: non-whole-line-emph. at beginning
129 my $ORF_10x_text = qr/(?|$ORF_10x_text_1|$ORF_10x_text_2)/; # combine with "branch reset" pattern
130 my $ORF_11x_title = qr/(?&S)((?&a_cya)[^\x07]+?)(?&E)$ORF_TTX_GRAMMAR/;
131 my $ORF_11x_subtitle = qr/^(?&a_ylw)([^\x07]+?)(?&E)$ORF_TTX_GRAMMAR/; # yellow subheadings (not inline-yellow)
132 my $ORF_11x_text_1 = qr/^(?&WHT)([^\x1d].+)(?&E)$ORF_TTX_GRAMMAR/; # text variant 1: starts with non-emphasised word (\x1d: don't match subres_2)
133 my $ORF_11x_text_2 = qr/^((?&a_cya).*(?&a_wht).*)(?&E)$ORF_TTX_GRAMMAR/; # text variant 2: starts with non-full-length emphasis
134 my $ORF_11x_text_3 = qr/^(?&g_red)(?&g_34)+((?&a_cya).+?)(?:(?&g_red)(?&g_34)+|$)$ORF_TTX_GRAMMAR/; # text variant 3: very last line (red band below 11x pages)
135 my $ORF_11x_text_xtra= qr/^((?:(?&a_ylw)|(?&a_grn)).*(?:(?&a_wht).*)?)(?&E)$ORF_TTX_GRAMMAR/; # hardly used; sort of paragraph heading (->121-20190526-greentext.vtx)
136 my $ORF_11x_text = qr/(?|$ORF_11x_text_1|$ORF_11x_text_2|$ORF_11x_text_3)/; # combines the regexes above into a single match group; (?|) resets the backref-number
137 my $ORF_70x_subres = qr/(?&S)(?&a_wht)(?&bg_new)(?&a_red)(?&_)(.+)(?&_)(?&bg_black)(?&E)$ORF_TTX_GRAMMAR/;
138 my $ORF_subressort = qr/(?&S)(.*?)(?&E)$ORF_TTX_GRAMMAR/;
139 my $ORF_ressort_topic= qr/(?&S)(?&a_wht)(?&_)(.+?)(?&_)(?&bg_new)(?&_)(.*?)(?&_)(?&bg_black)?(?&E)$ORF_TTX_GRAMMAR/; # topic=fallback title
140 my $ORF_emptyline = qr/^(?&ws)*$ORF_TTX_GRAMMAR$/;
141 my $ORF_advert = qr/^[\001\002\004](?&bg_new)(?:[\000\001\007](?!(?&cntrl))|[\003\a]\r|\r\0)(?&_)(.+)(?&_)$ORF_TTX_GRAMMAR/;
142 # b. reference and emphasis matching:
143 my $ORF_ref_name_1 = qr/([^\|]*?>+ ?(?:S. ?)?)/; # "Hofer >", "Opposition > S. "
144 my $ORF_ref_name_2 = qr/(>+[^\|]*?)/; # ">Platter "
145 my $ORF_ref_name = qr/(?|$ORF_ref_name_1|$ORF_ref_name_2)/;
146 my $ORF_ref_nums = qr/(\d{3}(?:[-\/]\d{3})?)/; # "113-116", "127/128", "115"
147 my $ORF_reference = qr/(?:(?&a_red)|(?&a_cya)) *$ORF_ref_name$ORF_ref_nums(?&_)(?:\||$)$ORF_TTX_GRAMMAR/;
148 my $ORF_10x_emph = qr/(?&a_red)(.*?)(?:(?&a_blk)|(\|)(?! ?(?&a_red))|$)$ORF_TTX_GRAMMAR/; # 10x: red-on-white
149 my $ORF_11x_emph_y = qr/(?&a_ylw)(.*?)(?:(?&a_wht)|(\|)(?! ?(?&a_ylw))|$)$ORF_TTX_GRAMMAR/; # 11x: yellow-on-black
150 my $ORF_11x_emph_c = qr/(?&a_cya)(.*?)(?:(?&a_wht)|(\|)(?! ?(?&a_cya))|$)$ORF_TTX_GRAMMAR/; # 11x: cyan-on-black
151 my $ORF_11x_emph_g = qr/(?&a_grn)(.*?)(?:(?&a_wht)|(\|)(?! ?(?&a_grn))|$)$ORF_TTX_GRAMMAR/; # 11x: green-on-black
152 my $ORF_emphasis = qr/(?|$ORF_10x_emph|$ORF_11x_emph_y|$ORF_11x_emph_c|$ORF_11x_emph_g)/;
153 my $ORF_subtitle = qr/(?&fake)(.+?)\|$ORF_TTX_GRAMMAR/; # uses fake ctrlchar to differentiate from yellow-emph
154 # c. rehyphenation:
155 my $ORF_hy_ergaenz = qr/(\b\w+\b)-\| ?\b(und|oder)\b ?/; # (Ergänzungsstrich, e.g. "Staats- und Regierungschefs")
156 my $ORF_hy_trenn = qr/([[:lower:]])-\| ?([[:lower:]])/; # (Trennstrich)
157 my $ORF_hy_binde = qr/(\S)-\| ?(\S)/; # (Bindestrich, e.g. "Mikl-Leitner", "30-jaehriges", etc.)
158 my $ORF_hy_gedanken = qr/[ |]-[ |]/; # (Gedankenstrich)
159 # d. unsave space / idiosyncrasies:
160 my $ORF_idio_comma = qr/(?|([[:alnum:])"]),([[:alpha:]])|([[:alpha:])"]),([[:alnum:]]))/; # comma: not between 5,4%
161 my $ORF_idio_period = qr/(?|([[:alnum:])"])\.([[:upper:]])|([[:alpha:])"])\.([[:digit:]]))/; # period: not between 1.000.000, www.foo.org
162 my $ORF_idio_URL = qr/(\S+)\. (ORF\.at)/; # special case for e.g. tirol.ORF.at
163 my $ORF_idio_colon = qr/(?|([[:alnum:]]):([[:alpha:]])|([[:alpha:]]):([[:alnum:]]))/; # colon: not between 1:0
164 my $ORF_idio_slash = qr/( ?)(?!<)\/( ?)/; # fix foo/bar at end of line ("ÖVP/ FPÖ"->"ÖVP/FPÖ"); negative lookahead to not match </foo> from emphasize()
165 # e. trimming, better-line-breaks, misc.:
166 my $ORF_ws = qr/(?&ws)$ORF_TTX_GRAMMAR/; # Note: control characters are rendered as whitespace; included as well
167 my $ORF_is_table = qr/[\x21-\x7e]((?&a_color)|[ ]){2,}[\x21-\x7e]|^\x11\x2c+\x06Liga-Start:$ORF_TTX_GRAMMAR/; # Warn: breaks when line3 is "m/n" -- filter beforehand
168 my $ORF_bleedover = qr/(.+?) +(.+)/; # when two-line ressort (left-aligned) bleeds into subressort (right-aligned)
169 # Note: [^\xNN] makes sure each line is only matched by 1 regex
170 # Note: inline markup is terminated by a ctrlchar (?&a_xxx), linebreak (\|) or end of string ($).
171 # negative lookahead after \| allows us to match multiline-emphasis in one go
172 # Note: ORF_hy_* removes linebreaks; have to sanitize lines not ending with a hyphen seperately
173 # Note: is_table: matches tables by alignment-whitespace; 215-219 have 'Liga-Start' in last line
174 # }}}
175 ### text mangling {{{
176 sub emphasize { for (@_) {
177 s{$ORF_reference}{ @{[$REF_MARKUP->($1, $2)]} }g;
178 s{$ORF_subtitle} {<h4>$1</h4>}g;
179 s{$ORF_emphasis} { <i>$1</i> $2}g;
180 }}
181 sub rehyphenate { for (@_) {
182 use charnames ();
183 s{$ORF_hy_ergaenz} {$1- $2 }g;
184 s{$ORF_hy_trenn} {$1\N{SOFT HYPHEN}$2}g;
185 s{$ORF_hy_binde} {$1-$2}g;
186 s{$ORF_hy_gedanken}{ \N{EN DASH} }g;
187 }}
188 sub linebreak { for (@_) {
189 s{\|\|+}{<p>}g; # mark up paragraphs
190 s/(<p>)+$//gs; # remove if last
191 s{\| *}{ }g; # remove other line-markers
192 }}
193 sub unsave_space { for (@_) {
194 s{$ORF_idio_comma} {$1, $2}g;
195 s{$ORF_idio_period}{$1. $2}g;
196 s{$ORF_idio_colon} {$1: $2}g;
197 s{$ORF_idio_slash} {$1/$1}g;
198 s{$ORF_idio_URL} {$1.$2}g;
199 }}
200 sub trim_ws { for (@_) {
201 # Note: also removes cntrl chars
202 s/^$ORF_ws+|$ORF_ws+$//g;
203 s/$ORF_ws+/ /g; # keep a space between words
204 }}
205 sub html_escape { for (@_) {
206 s/&/&amp;/g;
207 s/</&lt;/g;
208 # don't escape '>' or ref.matching breaks!
209 }}
210 =pod "Try to infer linebreaks without empty line"
211 For each line, check if the first word from the next line would fit in the same line
212 This heuristic is not flawless; ignoring short words helps a lot, but isn't perfect either.
213 To match each line and the next word (which are overlapping), forward lookahead is used.
214 =cut
215 sub better_line_breaks {
216 return if $_[0] =~ m/\|\|/; # mostly, only cramped pages (i.e. those without any empty lines) need this treatment.
217 # matches each physical line and the next word:
218 my @lines = $_[0] =~ m/(?=(?:^|\|)(.*?(?:\||$)[\x00-\x20]*?\w+))/sg;
219
220 my @text = split /\|/, $_[0];
221 for (0 .. $#lines) {
222 $text[$_] .= "|" if (length $lines[$_] < 39 and $lines[$_]=~m/\w{6,}$/); # double up "|" to make empty line
223 }
224 $_[0] = join '|', @text;
225 }
226 ### }}}
227
228 sub html { my ($file) = @_;
229 my ($title, $text, $subres, $ressort, $topic);
230 my ($title_2, $text_2, $subres_2);
231 my $advert;
232 my $is_table = 0;
233
234 my ($page, $subpage, @lines) = slurp_lines ($file);
235
236 # 1. parse header:
237 ($subres) = (shift @lines) =~ m{$ORF_subressort};
238 ($ressort, $topic) = (shift @lines) =~ m{$ORF_ressort_topic};
239 ($ressort, $subres) = ("$1 $ressort", $2) if ($subres =~ m/$ORF_bleedover/);
240 $subres = $topic if ($page =~ m/70./); # Bundesländer pages
241 my ($page_x_of_y) = (shift @lines) =~ m{([0-9/]+)}; # will be discarded
242
243 # 2. parse body:
244 for (@lines) {
245 if (/$ORF_emptyline/) {
246 $text .= "|" if defined $text;
247 } elsif (m{$ORF_is_table} and $page =~/2..|7[56]./) {
248 return {tabular=>1, raw=>\@lines, title=>$topic,page=>$page,subpage=>$subpage, ressort=>$ressort,subressort=>$subres};
249 } elsif ($page < 111) {
250 if (m{$ORF_10x_title}) { (defined $text?$text:$title) .= "$1|"; }
251 elsif (m{$ORF_10x_text}) { $text .= "$1|"; }
252 elsif (m{$ORF_advert}) {$advert .= "$1|" if defined $text;} # WARN: this parses ads
253 # NOTE: ads only appear after the text; "ads" before it are category pages, which we don't display at all
254 } elsif ($subres_2) { # second snippet of 70x page
255 if (m{$ORF_11x_title}) { $title_2 .= "$1|"; }
256 elsif (m{$ORF_11x_text}) { $text_2 .= "$1|" }
257 } else {
258 if (m{$ORF_11x_title}) { (defined $text?$text:$title) .= "$1|"; }
259 elsif (m{$ORF_11x_subtitle}) { (length $title?$text:$title) .= "\x7f$1|"; }
260 elsif (m{$ORF_11x_text}) { $text .= "$1|"; }
261 elsif (m{$ORF_11x_text_xtra}){ $text .= "|$1||"; }
262 elsif (m{$ORF_70x_subres}) { $subres_2 = $1; }
263 # TEMP: display other-colored text as normal:
264 elsif (m{^(?&a_color)$ORF_TTX_GRAMMAR}) {$text.="|{ $_ }|";}
265 }
266
267 $is_table += !!(defined $text and m{$ORF_is_table});
268 }
269
270 # 3. post-processing
271 if ($topic =~ /\bPresse zum\b/) {
272 # Pressespiegel: use topic as title; TODO: requires testing
273 $text = "$title $text";
274 $title = $topic;
275 } else {
276 # otherwise: only use topic if no title (6xx has neither)
277 $title //= $topic || "$ressort - $subres";
278 }
279 if ($is_table > 1) { # NOTE: we ignore a 1-off double-spacing mistake (quite common)
280 $text =~ s/(?&cntrl)$ORF_TTX_GRAMMAR/ /g; # remove all markup/colors
281 $text =~ s/[|{}]+/||/g; # force new paragraph for each ttx-line
282 }
283 better_line_breaks ($text); # tries to guess where a line break should be
284
285 html_escape ($text, $title, $advert//(), $text_2//(), $title_2//()); # HTML
286 emphasize ($text, $advert//(), $text_2//()); # HTML
287 trim_ws ($text, $title, $advert//(), $text_2//(), $title_2//());
288 rehyphenate ($text, $title, $advert//(), $text_2//(), $title_2//()); # UNICODE
289 linebreak ($text, $title, $advert//(), $text_2//(), $title_2//()); # HTML
290 unsave_space($text, $title, $advert//(), $text_2//(), $title_2//());
291
292 return {
293 title => $title,
294 text => $text,
295 topic => $topic,
296 page => $page,
297 subpage => defined $subres_2? 'A': 0+($subpage//0),
298 ressort => $ressort,
299 subressort => $subres,
300 advert => $advert,
301 }, defined $subres_2? {
302 title => $title_2,
303 text => $text_2,
304 topic => "",
305 page => $page,
306 subpage => 'B',
307 ressort => $ressort,
308 subressort => $subres_2,
309 }: undef;
310 }
311
312 sub weather { my ($spool, $city) = @_;
313 my $s = qr{[\x00-\x20]+};
314 my $S = qr{[^\x00-\x20]+?};
315 my %weather;
316 for (<$spool/601_01.vtx $spool/6{0[2-9],10}_??.vtx>) {
317 my ($page, $subpage, @lines) = slurp_lines ($_);
318 for (grep /${s}$city/, @lines) {
319 my ($loc, $time, $wetter, $temp) =
320 m/$s(.*?)$s($S)h$s(.*?)$s([-0-9,°]+)$s?/;
321
322 $weather{time} = "$time:00";
323 $weather{location} = $loc;
324 $weather{weather} = $wetter;
325 $weather{temperature} = $temp;
326 return %weather if $temp;
327 }
328 }
329 return undef;
330 };
331 1; # vim:foldmethod=marker
Imprint / Impressum