From a7cf837d16b5421c770eb9d4b3d6e8af78d4eb0b Mon Sep 17 00:00:00 2001 From: girst Date: Fri, 26 May 2017 21:32:07 +0200 Subject: [PATCH] remove some edge cases in html.pl --- ORFText.cgi | 22 +++++++++++++++++++--- html.pl | 53 ++++++++++++++++++++++++++--------------------------- 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/ORFText.cgi b/ORFText.cgi index 475f9ed..144b5a2 100755 --- a/ORFText.cgi +++ b/ORFText.cgi @@ -4,14 +4,30 @@ use strict; use warnings; use 5.010; -my @pages = (101, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 127, 128, 129, 130, 131, 132, 133, 134, 102, 136, 137, 138, 139, 140, 141, 142, 143, 108, 461, 462, 463, 464, 465); +my @pages = (101, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 127, 128, 129, 130, 131, 132, 133, 134, 102, 136, 137, 138, 139, 140, 141, 142, 143, 706, 108, 461, 462, 463, 464, 465); print "Content-type: text/html\n\n"; -print "ORFText News"; +print "ORFText News"; print "

ORFText News

"; +print 'pol | loc [T] | web'; foreach (@pages) { - print `./html.pl /run/ttxd/spool/2/${_}_00.vtx` if -e "/run/ttxd/spool/2/${_}_00.vtx"; + if ($_ == 101) { + print "

Politik

"; + } elsif ($_ == 102) { + print "

Chronik

"; + } elsif ($_ == 706) { + print "

Tirol

"; + } elsif ($_ == 108) { + print "

Web/Media

"; + } + my @subpages = glob "/run/ttxd/spool/2/${_}_*.vtx"; + foreach (@subpages) { + my $file_age = time() - (stat ($_))[9]; + print `./html.pl $_` if $file_age < 1000; + #print "

$_ : $file_age

" if $file_age > 1000; + } + #print `./html.pl /run/ttxd/spool/2/${_}_00.vtx` if -e "/run/ttxd/spool/2/${_}_00.vtx"; } print ""; diff --git a/html.pl b/html.pl index 7ce82f0..bc3eef8 100755 --- a/html.pl +++ b/html.pl @@ -11,40 +11,23 @@ use strict; use warnings; use 5.010; +# convert to utf-8 before outputting; WARN: still processes latin-1 (regexes too)! binmode STDOUT, ":encoding(utf8)"; -# Seitenformat: -# 100-109: -# Metadaten: 1 -# Subressort: 2 -# Ressort/Sparte: 3 -# Leer 4 -# Related: 5 -# Leer 6 -# Titel: 7 -# Text: 8-24 -# -# 112-899: -# Metadaten: 1 -# Subressort: 2 -# Ressort/Sparte: 3 -# Leer 4 -# Titel: 5 -# Text: 6-24 -# - my %meta; my $title; my $text = ""; +# TODO: run tzap/dvbtext in background if not already running my $page = shift; -my $subp = 0; #TODO: allow subpages +my ($subpage) = $page=~m/\d{3}_(\d{2})/; #requires ppp_ss file name scheme! @parens: https://stackoverflow.com/a/10034105 +$subpage += 0; # convert to number to remove leading zero # run through vtx2ascii (has been modified to output correct ISO 8859-1 without national replacements) open (VTX, "./vtx2ascii -a $page |") || die ("Can'r run vtx2ascii"); my $last = ""; my $is_10x = 0; do { - # transliterate from ETSI EN 300 706 G0 German to Latin-1 (will be converted to UTF-8 by perl): - tr/[\\]{|}~/\N{U+C4}\N{U+D6}\N{U+DC}\N{U+E4}\N{U+F6}\N{U+FC}\N{U+DF}/; + # transliterate from ETSI EN 300 706 G0 German to latin-1 (ÃÃÃÃ¤Ã¶Ã¼ÃÂ°Â§): + tr/[\\]{|}~`@/\N{U+C4}\N{U+D6}\N{U+DC}\N{U+E4}\N{U+F6}\N{U+FC}\N{U+DF}\N{U+B0}\N{U+A7}/; my $line = $_; $line =~ s/^\s+|\s+$//g; chomp ($line); @@ -56,21 +39,37 @@ do { when (4) {} when (5 + (1*$is_10x)) { $title = $line } when (4 + (1*$is_10x)) {} - when (4 + (3*$is_10x)) { $title .=$line } + when (4 + (3*$is_10x)) { $title .=$line if ($title eq "")} default { $text .= $last . "_EOL_" . ($last eq ""?"":($line eq ""?"
":" ")) } } $last = $line unless $. == (5+(2*$is_10x)); } while (); $text .= $last; -#remove hyphenation at original line ending only when in between lowercase letters, replace with soft hyphen to still allow hyphenation when needed. ad _EOL_: linebreaks already stripped in loop above; wouldn't work either way due to single line regex. +# substitute hyphenation: +# * replace with soft hyphen when it splits a word (between lowercase letters; +# still allows line break when necessary. +# * keep, when followed by uppercase letter (e.g. "03-JÃ¤hrige", "PIN-Nummer") +# * remove in any other case (was: forced hyphenation due to space constraints) +# ad _EOL_: linebreaks already stripped in loop above; wouldn't work either way +# due to single line regex. INFO: Underscore is in DE-localized teletext charset. $text =~ s/([[:lower:]])-_EOL_ ([[:lower:]])/\1\2/g; +$text =~ s/([[:alnum:]])-_EOL_ ([[:upper:]])/\1-\2/g; $text =~ s/_EOL_//g; -# adblocker: just add more regexes +# remove ORFText idiosyncrasies +$text =~ s/([[:alnum:]]),([[:alnum:]])/\1, \2/g; # no space after comma to save space +$text =~ s/([[:alnum:]])\.([[:upper:][:digit:]])/\1. \2/g; # no space after period to save space (WARN: breaks URLS like tirol.ORF.at) + +# adblocker: (keep it 7bit-ASCII; perl processes latin1, but output will be utf8) $text =~ s/Kalendarium - t.glich neu \. 734//g; +$text =~ s/>>tirol\. ?ORF\.at//g; -print "

$meta{'page'}: $title
$text

"; +my @tmp = split(' ',$meta{'date'}); +my $shortdate = substr $tmp[1], 0, 6; +my $pagesubpage = $meta{'page'} . ($subpage > 0?".$subpage":""); +my $moreinfo = "$meta{'res'} - $meta{'subres'}; $meta{'date'}"; +print "

$pagesubpage: $title
$text

"; close (VTX); -- 2.39.3