ORFText.pm

   1 package ORFText;
   2 our $VERSION = '2.00';
   3
   4 =pod Project: ORFText.pm
   5  - NOW: a module returning data structure with inline-html
   6  - LATER: instead of returning html return a syntax tree
   7    html tag insertion is limited to {{{text mangling}}}
   8         - html_escape # HTML
   9         - emphasize   # HTML
  10         - rehyphenate # UNICODE
  11         - linebreak   # HTML
  12         - $ORF_idio_slash (negative lookahead against </foo>)
  13
  14   TODO - URL references (">>tirol.ORF.at")
  15   (C) 2016-2019 Tobias Girstmair
  16   Extracts hypertext formatted news from ORF Teletext
  17   reads and decodes pages from dvbtext's spool directory.
  18 =cut
  19
  20 use 5.010;
  21 use strict;
  22 use warnings;
  23 use utf8;
  24 use Data::Dumper;
  25
  26 use base 'Exporter';
  27 our @EXPORT = qw(html weather $TABLE_YES $REF_MARKUP);
  28 our $TABLE_YES = 0; # enable that to parse tables w/ vtx2ascii
  29 our $REF_MARKUP = sub { return "<u>$_[0]<b>$_[1]</b></u>"};
  30
  31 ### i/o {{{
  32 use constant {
  33         VTX_HEADER => 12,
  34         TTX_HEADER =>  8,
  35         STATUS_BAR => 32,
  36 };
  37 sub slurp_lines { my ($file) = @_;
  38         open VTX, "<:raw", "$file" or die ("Can'r open $file");
  39
  40         # read page number:
  41         seek VTX, VTX_HEADER+TTX_HEADER, 0;
  42         read VTX, my $pagenum, 3;
  43         $pagenum &= "\x7f" x 3; # zero out parity bit
  44
  45         # read page content:
  46         seek VTX, VTX_HEADER+TTX_HEADER+STATUS_BAR, 0;
  47         read VTX, my $raw_text, 40*23;
  48         $raw_text &= ("\x7f" x length $raw_text); # zero out parity bit
  49         # NOTE: unpack chomps strings when using the 'A' template.
  50         my @lines = map {tr/[\\]{|}~`@/ÄÖÜäöüß°§/r} unpack 'A40'x23, $raw_text;
  51
  52         close VTX;
  53         my ($subpage) = $file=~m/\d{3}_(\d{2})\.vtx/;
  54         return ($pagenum, $subpage, @lines);
  55 }
  56 ### }}}
  57 ### Teletext Grammar {{{
  58 # control characters defined in ETSI EN 300 706 (2003) 12.2 Table 26
  59 my $ORF_TTX_GRAMMAR = qr {
  60 (?(DEFINE)
  61     (?<cntrl>
  62         (?>(?&a_color))    | (?>(?&g_color))
  63         | (?>(?&flash))    | (?>(?&steady))
  64         | (?>(?&endbox))   | (?>(?&startbox))
  65         | (?>(?&n_size))   | (?>(?&d_height))
  66         | (?>(?&d_width))  | (?>(?&d_size))
  67         | (?>(?&conceal))  | (?>(?&esc))
  68         | (?>(?&g_cont))   | (?>(?&g_sep))
  69         | (?>(?&bg_black)) | (?>(?&bg_new))
  70         | (?>(?&g_hold))   | (?>(?&g_release))
  71         | (?>(?&fake))
  72     )
  73     (?<a_color>
  74         (?>(?&a_blk))
  75         | (?>(?&a_red))
  76         | (?>(?&a_grn))
  77         | (?>(?&a_ylw))
  78         | (?>(?&a_blu))
  79         | (?>(?&a_mgt))
  80         | (?>(?&a_cya))
  81         | (?>(?&a_wht))
  82     )
  83     (?<g_color>
  84         (?>(?&g_blk))
  85         | (?>(?&g_red))
  86         | (?>(?&g_grn))
  87         | (?>(?&g_ylw))
  88         | (?>(?&g_blu))
  89         | (?>(?&g_mgt))
  90         | (?>(?&g_cya))
  91         | (?>(?&g_wht))
  92     )
  93     (?<a_blk> \x00) (?<g_blk> \x10) (?<blk> (?&a_blk)|(?&g_blk))
  94     (?<a_red> \x01) (?<g_red> \x11) (?<red> (?&a_red)|(?&g_red))
  95     (?<a_grn> \x02) (?<g_grn> \x12) (?<grn> (?&a_grn)|(?&g_grn))
  96     (?<a_ylw> \x03) (?<g_ylw> \x13) (?<ylw> (?&a_ylw)|(?&g_ylw))
  97     (?<a_blu> \x04) (?<g_blu> \x14) (?<blu> (?&a_blu)|(?&g_blu))
  98     (?<a_mgt> \x05) (?<g_mgt> \x15) (?<mgt> (?&a_mgt)|(?&g_mgt))
  99     (?<a_cya> \x06) (?<g_cya> \x16) (?<cya> (?&a_cya)|(?&g_cya))
 100     (?<a_wht> \x07) (?<g_wht> \x17) (?<wht> (?&a_wht)|(?&g_wht))
 101     (?<flash>    \x08) (?<steady>    \x09)
 102     (?<endbox>   \x0a) (?<startbox>  \x0b)
 103     (?<n_size>   \x0c) (?<d_height>  \x0d)
 104     (?<d_width>  \x0e) (?<d_size>    \x0f)
 105     (?<conceal>  \x18)
 106     (?<g_cont>   \x19) (?<g_sep>     \x1a)
 107     (?<esc>      \x1b) # switch G0 charset
 108     (?<bg_black> \x1c) (?<bg_new>    \x1d)
 109     (?<g_hold>   \x1e) (?<g_release> \x1f)
 110
 111     (?<ws> (?&cntrl)|[ ])
 112     (?<_> (?&ws)*)
 113     (?<WHT> (?&a_wht)|(?&g_wht)|[ ])
 114     (?<S> \A ^(?&_))
 115     (?<E> (?&_)$ \Z)
 116
 117     # graphics characters:
 118     (?<g_34> \x2c) # middle row (sixel 2^3 and 2^4)
 119
 120     # fake ctrlchar to mark up subheadings with emphasize():
 121     (?<fake> \x7f)
 122 )
 123 }xms;
 124 # ORF Specific Shortcuts:
 125 # a. page parsing:
 126 my $ORF_10x_title    = qr/(?&S)(?&WHT)(?&bg_new)((?&a_red)(?&_)[^\x00]+?)(?&E)$ORF_TTX_GRAMMAR/;
 127 my $ORF_10x_text_1   = qr/(?&S)(?&WHT)(?&bg_new)[^\x01](.+?)(?&E)$ORF_TTX_GRAMMAR/; # text variant 1: non-emph. beginning
 128 my $ORF_10x_text_2   = qr/(?&S)(?&WHT)(?&bg_new)((?&a_red).*(?&a_blk).*)(?&E)$ORF_TTX_GRAMMAR/; # text variant 2: non-whole-line-emph. at beginning
 129 my $ORF_10x_text     = qr/(?|$ORF_10x_text_1|$ORF_10x_text_2)/; # combine with "branch reset" pattern
 130 my $ORF_11x_title    = qr/(?&S)((?&a_cya)[^\x07]+?)(?&E)$ORF_TTX_GRAMMAR/;
 131 my $ORF_11x_subtitle = qr/^(?&a_ylw)([^\x07]+?)(?&E)$ORF_TTX_GRAMMAR/; # yellow subheadings (not inline-yellow)
 132 my $ORF_11x_text_1   = qr/^(?&WHT)([^\x1d].+)(?&E)$ORF_TTX_GRAMMAR/; # text variant 1: starts with non-emphasised word (\x1d: don't match subres_2)
 133 my $ORF_11x_text_2   = qr/^((?&a_cya).*(?&a_wht).*)(?&E)$ORF_TTX_GRAMMAR/; # text variant 2: starts with non-full-length emphasis
 134 my $ORF_11x_text_3   = qr/^(?&g_red)(?&g_34)+((?&a_cya).+?)(?:(?&g_red)(?&g_34)+|$)$ORF_TTX_GRAMMAR/; # text variant 3: very last line (red band below 11x pages)
 135 my $ORF_11x_text_xtra= qr/^((?:(?&a_ylw)|(?&a_grn)).*(?:(?&a_wht).*)?)(?&E)$ORF_TTX_GRAMMAR/; # hardly used; sort of paragraph heading (->121-20190526-greentext.vtx)
 136 my $ORF_11x_text     = qr/(?|$ORF_11x_text_1|$ORF_11x_text_2|$ORF_11x_text_3)/; # combines the regexes above into a single match group; (?|) resets the backref-number
 137 my $ORF_70x_subres   = qr/(?&S)(?&a_wht)(?&bg_new)(?&a_red)(?&_)(.+)(?&_)(?&bg_black)(?&E)$ORF_TTX_GRAMMAR/;
 138 my $ORF_subressort   = qr/(?&S)(.*?)(?&E)$ORF_TTX_GRAMMAR/;
 139 my $ORF_ressort_topic= qr/(?&S)(?&a_wht)(?&_)(.+?)(?&_)(?&bg_new)(?&_)(.*?)(?&_)(?&bg_black)?(?&E)$ORF_TTX_GRAMMAR/; # topic=fallback title
 140 my $ORF_emptyline    = qr/^(?&ws)*$ORF_TTX_GRAMMAR$/;
 141 my $ORF_advert       = qr/^[\001\002\004](?&bg_new)(?:[\000\001\007](?!(?&cntrl))|[\003\a]\r|\r\0)(?&_)(.+)(?&_)$ORF_TTX_GRAMMAR/;
 142 # b. reference and emphasis matching:
 143 my $ORF_ref_name_1   = qr/([^\|]*?>+ ?(?:S. ?)?)/;  # "Hofer >", "Opposition > S. "
 144 my $ORF_ref_name_2   = qr/(>+[^\|]*?)/;    # ">Platter "
 145 my $ORF_ref_name     = qr/(?|$ORF_ref_name_1|$ORF_ref_name_2)/;
 146 my $ORF_ref_nums     = qr/(\d{3}(?:[-\/]\d{3})?)/; # "113-116", "127/128", "115"
 147 my $ORF_reference    = qr/(?:(?&a_red)|(?&a_cya)) *$ORF_ref_name$ORF_ref_nums(?&_)(?:\||$)$ORF_TTX_GRAMMAR/;
 148 my $ORF_10x_emph     = qr/(?&a_red)(.*?)(?:(?&a_blk)|(\|)(?! ?(?&a_red))|$)$ORF_TTX_GRAMMAR/;  # 10x: red-on-white
 149 my $ORF_11x_emph_y   = qr/(?&a_ylw)(.*?)(?:(?&a_wht)|(\|)(?! ?(?&a_ylw))|$)$ORF_TTX_GRAMMAR/;  # 11x: yellow-on-black
 150 my $ORF_11x_emph_c   = qr/(?&a_cya)(.*?)(?:(?&a_wht)|(\|)(?! ?(?&a_cya))|$)$ORF_TTX_GRAMMAR/;  # 11x: cyan-on-black
 151 my $ORF_11x_emph_g   = qr/(?&a_grn)(.*?)(?:(?&a_wht)|(\|)(?! ?(?&a_grn))|$)$ORF_TTX_GRAMMAR/;  # 11x: green-on-black
 152 my $ORF_emphasis     = qr/(?|$ORF_10x_emph|$ORF_11x_emph_y|$ORF_11x_emph_c|$ORF_11x_emph_g)/;
 153 my $ORF_subtitle     = qr/(?&fake)(.+?)\|$ORF_TTX_GRAMMAR/; # uses fake ctrlchar to differentiate from yellow-emph
 154 # c. rehyphenation:
 155 my $ORF_hy_ergaenz   = qr/(\b\w+\b)-\| ?\b(und|oder)\b ?/; # (Ergänzungsstrich, e.g. "Staats- und Regierungschefs")
 156 my $ORF_hy_trenn     = qr/([[:lower:]])-\| ?([[:lower:]])/; # (Trennstrich)
 157 my $ORF_hy_binde     = qr/(\S)-\| ?(\S)/; # (Bindestrich, e.g. "Mikl-Leitner", "30-jaehriges", etc.)
 158 my $ORF_hy_gedanken  = qr/[ |]-[ |]/; # (Gedankenstrich)
 159 # d. unsave space / idiosyncrasies:
 160 my $ORF_idio_comma   = qr/(?|([[:alnum:])"]),([[:alpha:]])|([[:alpha:])"]),([[:alnum:]]))/;  # comma: not between 5,4%
 161 my $ORF_idio_period  = qr/(?|([[:alnum:])"])\.([[:upper:]])|([[:alpha:])"])\.([[:digit:]]))/;  # period: not between 1.000.000, www.foo.org
 162 my $ORF_idio_URL     = qr/(\S+)\. (ORF\.at)/;  # special case for e.g. tirol.ORF.at
 163 my $ORF_idio_colon   = qr/(?|([[:alnum:]]):([[:alpha:]])|([[:alpha:]]):([[:alnum:]]))/;  # colon: not between 1:0
 164 my $ORF_idio_slash   = qr/( ?)(?!<)\/( ?)/; # fix foo/bar at end of line ("ÖVP/ FPÖ"->"ÖVP/FPÖ"); negative lookahead to not match </foo> from emphasize()
 165 # e. trimming, better-line-breaks, misc.:
 166 my $ORF_ws           = qr/(?&ws)$ORF_TTX_GRAMMAR/;  # Note: control characters are rendered as whitespace; included as well
 167 my $ORF_is_table     = qr/[\x21-\x7e]((?&a_color)|[ ]){2,}[\x21-\x7e]|^\x11\x2c+\x06Liga-Start:$ORF_TTX_GRAMMAR/; # Warn: breaks when line3 is "m/n" -- filter beforehand
 168 my $ORF_bleedover    = qr/(.+?)  +(.+)/;  # when two-line ressort (left-aligned) bleeds into subressort (right-aligned)
 169 # Note: [^\xNN] makes sure each line is only matched by 1 regex
 170 # Note: inline markup is terminated by a ctrlchar (?&a_xxx), linebreak (\|) or end of string ($).
 171 #       negative lookahead after \| allows us to match multiline-emphasis in one go
 172 # Note: ORF_hy_* removes linebreaks; have to sanitize lines not ending with a hyphen seperately
 173 # Note: is_table: matches tables by alignment-whitespace; 215-219 have 'Liga-Start' in last line
 174 # }}}
 175 ### text mangling {{{
 176 sub emphasize { for (@_) {
 177         s{$ORF_reference}{ @{[$REF_MARKUP->($1, $2)]} }g;
 178         s{$ORF_subtitle} {<h4>$1</h4>}g;
 179         s{$ORF_emphasis} { <i>$1</i> $2}g;
 180 }}
 181 sub rehyphenate { for (@_) {
 182         use charnames ();
 183         s{$ORF_hy_ergaenz} {$1- $2 }g;
 184         s{$ORF_hy_trenn}   {$1\N{SOFT HYPHEN}$2}g;
 185         s{$ORF_hy_binde}   {$1-$2}g;
 186         s{$ORF_hy_gedanken}{ \N{EN DASH} }g;
 187 }}
 188 sub linebreak { for (@_) {
 189         s{\|\|+}{<p>}g;   # mark up paragraphs
 190         s/(<p>)+$//gs;    # remove if last
 191         s{\| *}{ }g;        # remove other line-markers
 192 }}
 193 sub unsave_space { for (@_) {
 194         s{$ORF_idio_comma} {$1, $2}g;
 195         s{$ORF_idio_period}{$1. $2}g;
 196         s{$ORF_idio_colon} {$1: $2}g;
 197         s{$ORF_idio_slash} {$1/$1}g;
 198         s{$ORF_idio_URL}   {$1.$2}g;
 199 }}
 200 sub trim_ws { for (@_) {
 201         # Note: also removes cntrl chars
 202         s/^$ORF_ws+|$ORF_ws+$//g;
 203         s/$ORF_ws+/ /g; # keep a space between words
 204 }}
 205 sub html_escape { for (@_) {
 206         s/&/&amp;/g;
 207         s/</&lt;/g;
 208         # don't escape '>' or ref.matching breaks!
 209 }}
 210 =pod "Try to infer linebreaks without empty line"
 211     For each line, check if the first word from the next line would fit in the same line
 212     This heuristic is not flawless; ignoring short words helps a lot, but isn't perfect either.
 213     To match each line and the next word (which are overlapping), forward lookahead is used.
 214 =cut
 215 sub better_line_breaks {
 216         return if $_[0] =~ m/\|\|/; # mostly, only cramped pages (i.e. those without any empty lines) need this treatment.
 217         # matches each physical line and the next word:
 218         my @lines = $_[0] =~ m/(?=(?:^|\|)(.*?(?:\||$)[\x00-\x20]*?\w+))/sg;
 219
 220         my @text = split /\|/, $_[0];
 221         for (0 .. $#lines) {
 222                 $text[$_] .= "|" if (length $lines[$_] < 39 and $lines[$_]=~m/\w{6,}$/); # double up "|" to make empty line
 223         }
 224         $_[0] = join '|', @text;
 225 }
 226 ### }}}
 227
 228 sub html { my ($file) = @_;
 229         my ($title, $text, $subres, $ressort, $topic);
 230         my ($title_2, $text_2, $subres_2);
 231         my $advert;
 232         my $is_table = 0;
 233
 234         my ($page, $subpage, @lines) = slurp_lines ($file);
 235
 236         # 1. parse header:
 237         ($subres) = (shift @lines) =~ m{$ORF_subressort};
 238         ($ressort, $topic) = (shift @lines) =~ m{$ORF_ressort_topic};
 239         ($ressort, $subres) = ("$1 $ressort", $2) if ($subres =~ m/$ORF_bleedover/);
 240         $subres = $topic if ($page =~ m/70./); # Bundesländer pages
 241         my ($page_x_of_y) = (shift @lines) =~ m{([0-9/]+)}; # will be discarded
 242
 243         # 2. parse body:
 244         for (@lines) {
 245                 if (/$ORF_emptyline/) {
 246                         $text .= "|" if defined $text;
 247                 } elsif (m{$ORF_is_table} and $page =~/2..|7[56]./) {
 248                         return {tabular=>1, raw=>\@lines, title=>$topic,page=>$page,subpage=>$subpage, ressort=>$ressort,subressort=>$subres};
 249                 } elsif ($page < 111) {
 250                         if    (m{$ORF_10x_title}) { (defined $text?$text:$title) .= "$1|"; }
 251                         elsif (m{$ORF_10x_text})  { $text .= "$1|"; }
 252                         elsif (m{$ORF_advert})   {$advert .= "$1|" if defined $text;} # WARN: this parses ads
 253                         # NOTE: ads only appear after the text; "ads" before it are category pages, which we don't display at all
 254                 } elsif ($subres_2) { # second snippet of 70x page
 255                         if    (m{$ORF_11x_title}) { $title_2 .= "$1|"; }
 256                         elsif (m{$ORF_11x_text})  { $text_2 .= "$1|" }
 257                 } else {
 258                         if    (m{$ORF_11x_title})    { (defined $text?$text:$title) .= "$1|"; }
 259                         elsif (m{$ORF_11x_subtitle}) { (length $title?$text:$title) .= "\x7f$1|"; }
 260                         elsif (m{$ORF_11x_text})     { $text .= "$1|"; }
 261                         elsif (m{$ORF_11x_text_xtra}){ $text .= "|$1||"; }
 262                         elsif (m{$ORF_70x_subres})   { $subres_2 = $1; }
 263                         # TEMP: display other-colored text as normal:
 264                         elsif (m{^(?&a_color)$ORF_TTX_GRAMMAR}) {$text.="|{ $_ }|";}
 265                 }
 266
 267                 $is_table += !!(defined $text and m{$ORF_is_table});
 268         }
 269
 270         # 3. post-processing
 271         if ($topic =~ /\bPresse zum\b/) {
 272                 # Pressespiegel: use topic as title; TODO: requires testing
 273                 $text = "$title $text";
 274                 $title = $topic;
 275         } else {
 276                 # otherwise: only use topic if no title (6xx has neither)
 277                 $title //= $topic || "$ressort - $subres";
 278         }
 279         if ($is_table > 1) {  # NOTE: we ignore a 1-off double-spacing mistake (quite common)
 280                 $text =~ s/(?&cntrl)$ORF_TTX_GRAMMAR/ /g; # remove all markup/colors
 281                 $text =~ s/[|{}]+/||/g; # force new paragraph for each ttx-line
 282         }
 283         better_line_breaks ($text); # tries to guess where a line break should be
 284
 285         html_escape ($text, $title, $advert//(), $text_2//(), $title_2//());      # HTML
 286         emphasize   ($text,         $advert//(), $text_2//());                    # HTML
 287         trim_ws     ($text, $title, $advert//(), $text_2//(), $title_2//());
 288         rehyphenate ($text, $title, $advert//(), $text_2//(), $title_2//());      # UNICODE
 289         linebreak   ($text, $title, $advert//(), $text_2//(), $title_2//());      # HTML
 290         unsave_space($text, $title, $advert//(), $text_2//(), $title_2//());
 291
 292         return {
 293                 title => $title,
 294                 text  => $text,
 295                 topic => $topic,
 296                 page  => $page,
 297                 subpage => defined $subres_2? 'A': 0+($subpage//0),
 298                 ressort => $ressort,
 299                 subressort => $subres,
 300                 advert => $advert,
 301             }, defined $subres_2? {
 302                 title => $title_2,
 303                 text  => $text_2,
 304                 topic => "",
 305                 page  => $page,
 306                 subpage => 'B',
 307                 ressort => $ressort,
 308                 subressort => $subres_2,
 309         }: undef;
 310 }
 311
 312 sub weather { my ($spool, $city) = @_;
 313         my $s = qr{[\x00-\x20]+};
 314         my $S = qr{[^\x00-\x20]+?};
 315         my %weather;
 316         for (<$spool/601_01.vtx $spool/6{0[2-9],10}_??.vtx>) {
 317                 my ($page, $subpage, @lines) = slurp_lines ($_);
 318                 for (grep /${s}$city/, @lines) {
 319                         my ($loc, $time, $wetter, $temp) =
 320                           m/$s(.*?)$s($S)h$s(.*?)$s([-0-9,°]+)$s?/;
 321
 322                         $weather{time} = "$time:00";
 323                         $weather{location} = $loc;
 324                         $weather{weather} = $wetter;
 325                         $weather{temperature} = $temp;
 326                         return %weather if $temp;
 327                 }
 328         }
 329         return undef;
 330 };
 331 1; # vim:foldmethod=marker