#! /usr/bin/perl
#use utf8;
use XML::Parser::Lite;
use Socket;
use Encode qw(:all);
#use threads;
#use threads::shared;
alarm(9.5*60);
$url='http://news.eu.by/';
$subject='Belarus';
$root=substr($0,0,rindex($0,'/'));
$pub="$root/dir";
$web="$root/html";
$temp="$root/tmp";
$xmlname="newz";
$deltaname="newz";
$htmlname="newz";
#my $enc='.gz';
$enc='';
$title='Breaking News!';
$rotate_time=5400; # sec;
$proxy='195.50.2.154:8080';
@or=('Belarus','Belorussia','Byelorussia','Belarussian','Byelorussian','Bielorussia','Bielorusso','Bielorussa','Belarusse');
$|=1;
do "$root/fin";
my $encoding='windows-1251';
my $fail=0;
my $heads=qq(
$title
);
my $ad1=q(
);
my $ad2=q(
);
my %cont:shared=(
'mahatma.bspu.unibel.by'=>'Dzianis Kahanovich',
'www.bspu.unibel.by'=>'Belarussian State Pedagogical University',
'www.belta.by'=>'BELTA',
'news.google.com'=>'Google News',
'finance.google.com'=>'Google Finance'
);
my $or_='+OR+';
#my $or_='+%7C+';
my %goo=(
''=>{q=>join($or_,@or)},
'de'=>{ned=>'tde',q=>join($or_,('Wei%C3%9Frussland','Belarus','Belorussland','Wei%C3%9Fru%C3%9Fland','Beloru%C3%9Fland','wei%C3%9Frussisch','belarussich','wei%C3%9Fruthenisch','Belarusse','Wei%C3%9Frusse','Belarussin','Wei%C3%9Frussin','Wei%C3%9Frussische'))},
'nl_nl'=>{ned=>'tnl_nl',q=>join($or_,('Wit-Rusland','Wit-Russisch','Wit-Rus','Wit-Russin','Wit-Russische','Bjelo-Rusland','Bjelorussisch','Bjelorussische','Bjelorus','Belarus'))},
'fr'=>{ned=>'tfr',q=>join($or_,('Belarus','Bi%C3%A9lorussie','Bi%C3%A9lorusse','Bielaroussy','Bielarouss','Bi%C3%A9larussie','Bi%C3%A9larusse'))},
'pt-PT_pt'=>{ned=>'tpt-PT_pt',q=>join($or_,('Bielo-R%C3%BAssia','Bielor%C3%BAssia','Bielorr%C3%BAssia'))}
);
my $ya=join('%7C','%C1%E5%EB%E0%F0%F3%F1%FC','%C1%E5%EB%E0%F0%F3%F1%FC','%C1%E5%EB%EE%F0%F3%F1%F1%EA%E8%E9');
my @lang=('en','es','fr','de','nl_nl','it','pt-PT_pt','pt-BR_br');
my %lang_web=('en'=>2,'ru'=>1,'fr'=>2,'de'=>2,'nl_nl'=>2,'it'=>2,'pt-PT_pt'=>2,'pt-BR_br'=>2);
my $results=0;
sub fn{
substr($_[1],-3) eq '.gz'?($_[0]?"|gzip -9 >$_[1]":"gzip -dc <$_[1]|"):($_[0]?">$_[1]":"<$_[1]");
}
sub add_cont{
my $l=$_[0].'/';
my $x;
$l=~s/http\:\/\/(.*?)[\:\/]/$x=$1;''/gsei;
$cont{$x}||=$_[1];
}
sub cur_cont{
my ($i,$r);
for(sort keys %cont){
$i=unesc($_);
$r.="
".url("http://$i",$cont{$_}) if(index($_[0],$_)>=0 || index($_[0],$i)>=0 || $_ ne $cont{$_});
}
$r
}
sub esc{
my $x=shift;
$x=~s/([\x00-\x1f,:\"\'\\\/])/sprintf('%%%02X',ord($1))/eg;
$x;
}
sub unesc{
my $x=shift;
local $1;
$x=~s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
$x;
}
sub qm{
#quotemeta($_[0])
my $x=shift;
$x=~s/([\'\\])/\\$1/gs;
$x=~s/\r/\\r/gs;
$x=~s/\n/\\n/gs;
$x;
}
sub get_xml{
my ($s,%h,@a,@a1,@ad);
@a=split(/:\/\//,$_[0],2);
unshift @a,'http' if(!defined($a[1]));
@a[1,3]=split(/\//,$a[1],2);
@ad=@a[1,2]=split(/:/,$a[1],2);
@a1=@a;
$ad[1]||=80;
$a1[0]&&="$a[0]://";
$a1[2]&&=":$a[2]";
$a1[3]&&="/$a[3]";
if($proxy){
@ad=split(/:/,$proxy,2);
$a1[3]=join('',@a1);
}
print "+";
socket(SO,PF_INET,SOCK_STREAM,PROTO_TCP)&&
connect(SO,sockaddr_in($ad[1],inet_aton($ad[0])))&&goto OK;
close(SO);
$fail++;
return;
OK:
select(SO);$|=1;select(STDOUT);
print SO qq($_[1] $a1[3] HTTP/1.1
Host: $a1[1]
User-Agent: robot $url
Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5
Accept-Language: ru,be;q=0.8,en-us;q=0.5,en;q=0.3
Accept-Encoding: none
$_[2]Connection: close
);
while((!eof(SO))&&defined(my $x=)){$s.=$x;$x=~s/[\r\n]*//gs;$x||last}
$s=~s/(.*?): (.*?)[\r]\n/$h{lc($1)}=$2/gise;
undef $s;
while((!eof(SO))&&defined(my $x=)){$s.=$x}
close(SO);
my $s1;
$s=~s/.*(<\?xml.*?\/rss>).*/$s1=$1;''/se;
$s1,%h
}
sub url{
my $u=$_[0];
my $t=$_[1]||unesc($u);
"$t"
}
my (@block,%item,%channel,@items,$cnt0,$cnt0_);
## 'id'=>[start,char,end,start1,char1.end1];
my %blocks=(
'rss.channel'=>[
sub{%channel=()},
undef,
undef,
undef,
sub{shift;$channel{$block[$#block]}=join('',@_)}
],
'rss.channel.item'=>[
sub{%item=()},
undef,
sub{push @items,{%item};undef %item},
undef,
sub{shift;$item{$block[$#block]}=join('',@_)},
undef
],
'rss.channel.item.link'=>[
]
);
#$blocks{'rdf:RDF.item'}=$blocks{'rss.channel.item'};
my %handlers=(
Start=>sub{parser_event(3,@_);push @block,$_[1];unshift @_,0;goto &parser_event},
Char=>sub{parser_event(1,@_);unshift @_,4;goto &parser_event},
End=>sub{parser_event(2,@_);while($_[1] ne pop @block){};unshift @_,5;goto &parser_event},
);
sub parser_event{
my $e=shift;
my $id=join('.',@block[0..$#block-($e>3)]);
#print "$id\n";
if(exists($blocks{$id})){
my $h=$blocks{$id};
goto ref($h)||return;
HASH:return $h->{('Start','Char','End')[$e]}(@_);
ARRAY:return defined(@$h[$e])?&{@$h[$e]}(@_):undef;
SCALAR:return;
}
}
sub get_rss{
my $url=shift;
my $retry=2;
RETRY:
my ($x,%h,$ff,$ffb,$t,$p,$lm);
print "get $url\n";
if(substr($url,0,7) eq 'file://'){
$ff=substr($url,7);
return if(!-e $ff);
goto FILE;
}
add_cont($url);
($x,%h)=get_xml($url,'HEAD');
return if(!defined(%h));
if(exists($h{'last-modified'})){
$ffb="$temp/newz-".esc($url);
stat($ff="$ffb.".esc($h{'last-modified'}));
if(-e _){
FILE:
print "== $ff\n";
open(FF,fn(0,$ff)) or die "$! $ff";
$x='';
while(!eof(FF)&&(my $s=)){$x.=$s}
close(FF);
}else{
($x,%h)=get_xml($url,'GET');
return if(!defined($x));
}
}else{($x,%h)=get_xml($url,'GET');
}
$p=new XML::Parser::Lite;
$p->setHandlers(%handlers);
my ($enc,@a);
$x=~s/\<\?xml(.*?)\>/push @a,$1;"\<\?xml$1\>"/ges;
for(@a){$_=~s/ encoding\=\"(.*?)\"/$enc=lc($1);" encoding=\"$encoding\""/ges}
from_to($x,$enc,$encoding,HTMLCREF) if(defined($enc) && $enc ne $encoding);
undef %channel;
@items=();
if(index($x,'/rss>')<0){print "\nERROR: $url\n";$retry--?goto RETRY:return}
$p->parse($x);
## debug:
#$ff=substr(($ffb="$temp/newz-".esc($url)).".".esc($h{'last-modified'}||=gmtime),0,128);
if(exists($h{'last-modified'})){
while(my $d=<$ffb.*>){unlink($d)}
open(FF,fn(1,$ff)) or die $!;
print FF $x;
close(FF);
}
$x=$channel{title};
$x=~s/\:.*//s;
add_cont($channel{link},$x) if($channel{link});
addnews(@_);
}
my %htm=(
'lt'=>'<',
'gt'=>'>',
'amp'=>'&',
'quot'=>'"'
);
sub dehtml{
my $s=shift;
$s=~s/\&(.*?)\;/$htm{$1}||"\&$1;"/gse;
$s
}
sub htmlz{
my $s=shift;
for(keys %htm){$s=~s/$htm{$_}/$_/gs}
}
my @news0:shared;
my %news:shared;
my %nh:shared;
my %mm=('Jan'=>0,'Feb'=>1,'Mar'=>2,'Apr'=>3,'May'=>4,'Jun'=>5,'Jul'=>6,'Aug'=>7,'Sep'=>8,'Oct'=>9,'Nov'=>10,'Dec'=>11);
sub nkey{$_[0]->{xlink}||$_[0]->{link}||$_[0]->{description}}
# [param_redirect[,lang[,no_sort_by_time]]]
sub addnews{
my ($l,$x);
my $u=$_[0];
for(@items){
$l=$_->{link};
$_->{lang}||=$_[1];
if($u){
$x="$l\&";
$x=~s/[\&\?\;]$u\=(.*?)\&/$l=dehtml($1);''/gse;
if($x ne "$l\&"){
$l="http://$l" if(index($l,'://')==-1);
$_->{xlink}=$l;
}
};
$l=nkey($_);
if(exists($nh{$l})){$nh{$l}++}
else{
if($_[2]){
$nh{$l}=0;
push @news0,$_;
}else{
$nh{$l}=1;
my ($t1,$t)=(0,$_->{pubDate});
$t=~s/([0-9]{2})\:([0-9]{2})\:([0-9]{2})/$t1=$3+($2+$1*60)*60;''/e;
$t=~s/([0-9]{1,2}) ([a-zA-Z]{3}) ([0-9]{4})/$t1+=($1+$mm{$2}*31+$3*365)*24*60*60;''/e;
$t=~s/\+0([0-9])00/$t1-=$1*60*60;''/ex;
$t=~s/\-0([0-9])00/$t1+=$1*60*60;''/ex;
$t1++ while(exists($news{$t1}));
$news{$t1}=$_
}
add_cont($l)
}
}
}
sub mv{rename($_[0],$_[1])||`mv -f $_[0] $_[1]`}
my $time=gmtime;
sub time2h{
my $x="$_[0] GMT";
$x=~s/ /\ \;/gs;
$x
}
my ($t0,$time0,$tstamp,$counter,$rotate);
if(open(FT,"<$pub/time")){
$t0=;chomp($t0);
$time0=;chomp($time0);
$counter=;chomp($counter);
$tstamp=;chomp($tstamp);
close FT;
print "time: ",time-$t0,"\n";
};
$tstamp||=time;
sub cp{
my $x=quotemeta($_[0]);
my $y=quotemeta($_[1]);
`cp -f $x $y`
}
if($ARGV[0] eq 'test'){$web=$pub=$temp;}
get_rss("file://$pub/$xmlname.xml$enc",undef,undef,1);$cnt0_=$cnt0=$#news0;
if($ARGV[0] ne 'test' && (time-$t0>$rotate_time || $ARGV[0] eq 'rotate')){
cp("$pub/$xmlname.xml$enc","$pub/$xmlname.$time.xml$enc");
cp("$pub/$htmlname.html$enc","$pub/$htmlname.$time.html$enc");
undef $t0;
$rotate=1;
$cnt0=-1;
}
if(!defined($t0)){($t0,$time0,$counter)=(time,$time,0)}
my $hhead=qq(
".(exists($_->{'xlink'})?url(unesc($x),'link').' ':'').$s;
push @fdelta,qm($s) if($cnt>=$cnt0);
}
print FF "\n";
for my $tag(keys %{$_}){print FF "<$tag>$_->{$tag}$tag>"}
print FF "";
}
if($cnt==$cnt0_){
$tstamp=time;
$fh.='';
@fdelta[$#fdelta].='' if($cnt>$cnt0);
}
}
print FF "\n";
close FF;
my $cont_="
Contributors:
".cur_cont($fh)."
";
$fh.="\n\n$cont_\n$ad1$ad2";
open(FH,fn(1,"$pub/$htmlname.tmp.html$enc")) or die "$! $pub/$htmlname.tmp.html";
print FH $fh;
close(FH);
if($#fdelta>=0){
$counter++;
open FH,fn(1,"$web/$deltaname$counter.js$enc");
print FH "d([";
my $c='';
for(@fdelta){print FH "$c'".$_."'";$c=','};
print FH "])";
close(FH);
for(my $i=$counter+1;unlink "$web/$deltaname$i.js$enc";$i++){}
my $fj;
for(1..$counter){$fj="$fj"}
$fj=qq($heads
$fj
$ad1
.);
#
$ad2
.
open(FH,fn(1,"$web/index-j.html$enc"));
print FH $fj;
close(FH);
}
close(FH);
mv("$pub/$htmlname.tmp.html$enc","$pub/$htmlname.html$enc");
mv("$pub/$xmlname.tmp.xml$enc","$pub/$xmlname.xml$enc");
#if($rotate){
unlink("$pub/time");
open(FT,">$pub/time");
print FT join("\n",$t0,$time0,$counter,$tstamp);
close FT;
#}
__END__
License: Anarchy.
Все стихийные (включая социальные (включая юридические, моральные и т.д.))
аспекты существования и использования данного кода являются форс-мажорными
обстоятельствами и автора не интересуют.
(c) mahatma, 29.09.2006