#! /usr/bin/perl #use utf8; use XML::Parser::Lite; use Socket; use Encode qw(:all); #use threads; #use threads::shared; alarm(9.5*60); $url='http://news.eu.by/'; $subject='Belarus'; $root=substr($0,0,rindex($0,'/')); $pub="$root/dir"; $web="$root/html"; $temp="$root/tmp"; $xmlname="newz"; $deltaname="newz"; $htmlname="newz"; #my $enc='.gz'; $enc=''; $title='Breaking News!'; $rotate_time=5400; # sec; $proxy='195.50.2.154:8080'; @or=('Belarus','Belorussia','Byelorussia','Belarussian','Byelorussian','Bielorussia','Bielorusso','Bielorussa','Belarusse'); $|=1; do "$root/fin"; my $encoding='windows-1251'; my $fail=0; my $heads=qq($title ); my $ad1=q( ); my $ad2=q( ); my %cont:shared=( 'mahatma.bspu.unibel.by'=>'Dzianis Kahanovich', 'www.bspu.unibel.by'=>'Belarussian State Pedagogical University', 'www.belta.by'=>'BELTA', 'news.google.com'=>'Google News', 'finance.google.com'=>'Google Finance' ); my $or_='+OR+'; #my $or_='+%7C+'; my %goo=( ''=>{q=>join($or_,@or)}, 'de'=>{ned=>'tde',q=>join($or_,('Wei%C3%9Frussland','Belarus','Belorussland','Wei%C3%9Fru%C3%9Fland','Beloru%C3%9Fland','wei%C3%9Frussisch','belarussich','wei%C3%9Fruthenisch','Belarusse','Wei%C3%9Frusse','Belarussin','Wei%C3%9Frussin','Wei%C3%9Frussische'))}, 'nl_nl'=>{ned=>'tnl_nl',q=>join($or_,('Wit-Rusland','Wit-Russisch','Wit-Rus','Wit-Russin','Wit-Russische','Bjelo-Rusland','Bjelorussisch','Bjelorussische','Bjelorus','Belarus'))}, 'fr'=>{ned=>'tfr',q=>join($or_,('Belarus','Bi%C3%A9lorussie','Bi%C3%A9lorusse','Bielaroussy','Bielarouss','Bi%C3%A9larussie','Bi%C3%A9larusse'))}, 'pt-PT_pt'=>{ned=>'tpt-PT_pt',q=>join($or_,('Bielo-R%C3%BAssia','Bielor%C3%BAssia','Bielorr%C3%BAssia'))} ); my $ya=join('%7C','%C1%E5%EB%E0%F0%F3%F1%FC','%C1%E5%EB%E0%F0%F3%F1%FC','%C1%E5%EB%EE%F0%F3%F1%F1%EA%E8%E9'); my @lang=('en','es','fr','de','nl_nl','it','pt-PT_pt','pt-BR_br'); my %lang_web=('en'=>2,'ru'=>1,'fr'=>2,'de'=>2,'nl_nl'=>2,'it'=>2,'pt-PT_pt'=>2,'pt-BR_br'=>2); my $results=0; sub fn{ substr($_[1],-3) eq '.gz'?($_[0]?"|gzip -9 >$_[1]":"gzip -dc <$_[1]|"):($_[0]?">$_[1]":"<$_[1]"); } sub add_cont{ my $l=$_[0].'/'; my $x; $l=~s/http\:\/\/(.*?)[\:\/]/$x=$1;''/gsei; $cont{$x}||=$_[1]; } sub cur_cont{ my ($i,$r); for(sort keys %cont){ $i=unesc($_); $r.="
  • ".url("http://$i",$cont{$_}) if(index($_[0],$_)>=0 || index($_[0],$i)>=0 || $_ ne $cont{$_}); } $r } sub esc{ my $x=shift; $x=~s/([\x00-\x1f,:\"\'\\\/])/sprintf('%%%02X',ord($1))/eg; $x; } sub unesc{ my $x=shift; local $1; $x=~s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; $x; } sub qm{ #quotemeta($_[0]) my $x=shift; $x=~s/([\'\\])/\\$1/gs; $x=~s/\r/\\r/gs; $x=~s/\n/\\n/gs; $x; } sub get_xml{ my ($s,%h,@a,@a1,@ad); @a=split(/:\/\//,$_[0],2); unshift @a,'http' if(!defined($a[1])); @a[1,3]=split(/\//,$a[1],2); @ad=@a[1,2]=split(/:/,$a[1],2); @a1=@a; $ad[1]||=80; $a1[0]&&="$a[0]://"; $a1[2]&&=":$a[2]"; $a1[3]&&="/$a[3]"; if($proxy){ @ad=split(/:/,$proxy,2); $a1[3]=join('',@a1); } print "+"; socket(SO,PF_INET,SOCK_STREAM,PROTO_TCP)&& connect(SO,sockaddr_in($ad[1],inet_aton($ad[0])))&&goto OK; close(SO); $fail++; return; OK: select(SO);$|=1;select(STDOUT); print SO qq($_[1] $a1[3] HTTP/1.1 Host: $a1[1] User-Agent: robot $url Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5 Accept-Language: ru,be;q=0.8,en-us;q=0.5,en;q=0.3 Accept-Encoding: none $_[2]Connection: close ); while((!eof(SO))&&defined(my $x=)){$s.=$x;$x=~s/[\r\n]*//gs;$x||last} $s=~s/(.*?): (.*?)[\r]\n/$h{lc($1)}=$2/gise; undef $s; while((!eof(SO))&&defined(my $x=)){$s.=$x} close(SO); my $s1; $s=~s/.*(<\?xml.*?\/rss>).*/$s1=$1;''/se; $s1,%h } sub url{ my $u=$_[0]; my $t=$_[1]||unesc($u); "$t" } my (@block,%item,%channel,@items,$cnt0,$cnt0_); ## 'id'=>[start,char,end,start1,char1.end1]; my %blocks=( 'rss.channel'=>[ sub{%channel=()}, undef, undef, undef, sub{shift;$channel{$block[$#block]}=join('',@_)} ], 'rss.channel.item'=>[ sub{%item=()}, undef, sub{push @items,{%item};undef %item}, undef, sub{shift;$item{$block[$#block]}=join('',@_)}, undef ], 'rss.channel.item.link'=>[ ] ); #$blocks{'rdf:RDF.item'}=$blocks{'rss.channel.item'}; my %handlers=( Start=>sub{parser_event(3,@_);push @block,$_[1];unshift @_,0;goto &parser_event}, Char=>sub{parser_event(1,@_);unshift @_,4;goto &parser_event}, End=>sub{parser_event(2,@_);while($_[1] ne pop @block){};unshift @_,5;goto &parser_event}, ); sub parser_event{ my $e=shift; my $id=join('.',@block[0..$#block-($e>3)]); #print "$id\n"; if(exists($blocks{$id})){ my $h=$blocks{$id}; goto ref($h)||return; HASH:return $h->{('Start','Char','End')[$e]}(@_); ARRAY:return defined(@$h[$e])?&{@$h[$e]}(@_):undef; SCALAR:return; } } sub get_rss{ my $url=shift; my $retry=2; RETRY: my ($x,%h,$ff,$ffb,$t,$p,$lm); print "get $url\n"; if(substr($url,0,7) eq 'file://'){ $ff=substr($url,7); return if(!-e $ff); goto FILE; } add_cont($url); ($x,%h)=get_xml($url,'HEAD'); return if(!defined(%h)); if(exists($h{'last-modified'})){ $ffb="$temp/newz-".esc($url); stat($ff="$ffb.".esc($h{'last-modified'})); if(-e _){ FILE: print "== $ff\n"; open(FF,fn(0,$ff)) or die "$! $ff"; $x=''; while(!eof(FF)&&(my $s=)){$x.=$s} close(FF); }else{ ($x,%h)=get_xml($url,'GET'); return if(!defined($x)); } }else{($x,%h)=get_xml($url,'GET'); } $p=new XML::Parser::Lite; $p->setHandlers(%handlers); my ($enc,@a); $x=~s/\<\?xml(.*?)\>/push @a,$1;"\<\?xml$1\>"/ges; for(@a){$_=~s/ encoding\=\"(.*?)\"/$enc=lc($1);" encoding=\"$encoding\""/ges} from_to($x,$enc,$encoding,HTMLCREF) if(defined($enc) && $enc ne $encoding); undef %channel; @items=(); if(index($x,'/rss>')<0){print "\nERROR: $url\n";$retry--?goto RETRY:return} $p->parse($x); ## debug: #$ff=substr(($ffb="$temp/newz-".esc($url)).".".esc($h{'last-modified'}||=gmtime),0,128); if(exists($h{'last-modified'})){ while(my $d=<$ffb.*>){unlink($d)} open(FF,fn(1,$ff)) or die $!; print FF $x; close(FF); } $x=$channel{title}; $x=~s/\:.*//s; add_cont($channel{link},$x) if($channel{link}); addnews(@_); } my %htm=( 'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"' ); sub dehtml{ my $s=shift; $s=~s/\&(.*?)\;/$htm{$1}||"\&$1;"/gse; $s } sub htmlz{ my $s=shift; for(keys %htm){$s=~s/$htm{$_}/$_/gs} } my @news0:shared; my %news:shared; my %nh:shared; my %mm=('Jan'=>0,'Feb'=>1,'Mar'=>2,'Apr'=>3,'May'=>4,'Jun'=>5,'Jul'=>6,'Aug'=>7,'Sep'=>8,'Oct'=>9,'Nov'=>10,'Dec'=>11); sub nkey{$_[0]->{xlink}||$_[0]->{link}||$_[0]->{description}} # [param_redirect[,lang[,no_sort_by_time]]] sub addnews{ my ($l,$x); my $u=$_[0]; for(@items){ $l=$_->{link}; $_->{lang}||=$_[1]; if($u){ $x="$l\&"; $x=~s/[\&\?\;]$u\=(.*?)\&/$l=dehtml($1);''/gse; if($x ne "$l\&"){ $l="http://$l" if(index($l,'://')==-1); $_->{xlink}=$l; } }; $l=nkey($_); if(exists($nh{$l})){$nh{$l}++} else{ if($_[2]){ $nh{$l}=0; push @news0,$_; }else{ $nh{$l}=1; my ($t1,$t)=(0,$_->{pubDate}); $t=~s/([0-9]{2})\:([0-9]{2})\:([0-9]{2})/$t1=$3+($2+$1*60)*60;''/e; $t=~s/([0-9]{1,2}) ([a-zA-Z]{3}) ([0-9]{4})/$t1+=($1+$mm{$2}*31+$3*365)*24*60*60;''/e; $t=~s/\+0([0-9])00/$t1-=$1*60*60;''/ex; $t=~s/\-0([0-9])00/$t1+=$1*60*60;''/ex; $t1++ while(exists($news{$t1})); $news{$t1}=$_ } add_cont($l) } } } sub mv{rename($_[0],$_[1])||`mv -f $_[0] $_[1]`} my $time=gmtime; sub time2h{ my $x="$_[0] GMT"; $x=~s/ /\ \;/gs; $x } my ($t0,$time0,$tstamp,$counter,$rotate); if(open(FT,"<$pub/time")){ $t0=;chomp($t0); $time0=;chomp($time0); $counter=;chomp($counter); $tstamp=;chomp($tstamp); close FT; print "time: ",time-$t0,"\n"; }; $tstamp||=time; sub cp{ my $x=quotemeta($_[0]); my $y=quotemeta($_[1]); `cp -f $x $y` } if($ARGV[0] eq 'test'){$web=$pub=$temp;} get_rss("file://$pub/$xmlname.xml$enc",undef,undef,1);$cnt0_=$cnt0=$#news0; if($ARGV[0] ne 'test' && (time-$t0>$rotate_time || $ARGV[0] eq 'rotate')){ cp("$pub/$xmlname.xml$enc","$pub/$xmlname.$time.xml$enc"); cp("$pub/$htmlname.html$enc","$pub/$htmlname.$time.html$enc"); undef $t0; $rotate=1; $cnt0=-1; } if(!defined($t0)){($t0,$time0,$counter)=(time,$time,0)} my $hhead=qq(
    Open Source News $subject). '~~'.($rotate_time/3600).'h: '.time2h($time0)." - ".time2h($time)."
    ". url("/dir/")." ".url("/dir/".esc("$htmlname.$time0.html$enc"),"$time0 GMT")."
    "; my $fh=qq($heads $hhead
      ); #get_rss('http://newsrss.bbc.co.uk/rss/russian/institutional/pda/rss.xml'); if($ARGV[0] eq 'test'){get_rss("file://$temp/fin.xml",undef,'en');goto noget;} goto noget if($ARGV[0] eq 'html'); for(@lang){ for(my $st=0;$st<=$results;$st+=10){ # my $x='http://news.google.com/news?hl=en&ned=tus&ie=UTF-8&scoring=d&q=Belarus+OR+Belorussia+OR+Belarussian&output=rss'; my $x='http://news.google.com/news?ie=UTF-8&scoring=d&output=rss'; my $ll=$_; if(!exists($goo{$_})){ $ll=''; $x.="\&ned=t$_"; } $x.="\&start=$st" if($st); for my $k (keys %{$goo{$ll}}){$x.="\&$k=$goo{$ll}->{$k}"} get_rss($x,'url',$_); last if($#items<0); } } #get_rss('http://www.afn.by/news/rss/',undef,'ru'); #get_rss('http://www.euronews.net/rss/euronews_ru.xml',undef,'ru'); get_rss("file://$temp/fin.xml",undef,'en'); get_rss('http://news.tut.by/rss/all.rss',undef,'ru'); get_rss('http://www.charter97.org/export/index.xml',undef,'ru'); get_rss('http://news.yandex.ru/Belarus/index.rss','cl4url','ru'); #get_rss('http://blogs.yandex.ru/search.rss?how=tm&rd=2&text='.$ya,undef,'ru'); get_rss('http://blogs.yandex.ru/search.rss?how=tm&rd=2&text='.$ya.'&searchtarget_blogs=on',undef,'ru'); #get_rss('http://www.blogdigger.com/search?q=Belarus&sortby=date&type=rss',undef,undef); #get_rss('http://www.belta.by/by/belta.rss',undef,'by'); get_rss('http://www.belta.by/ru/belta.rss',undef,'ru'); die "$fail failures!!" if($fail>2); noget: alarm(0); my $cont0; for(sort keys %cont){ $i=unesc($_); $cont0.="
    • ".url("http://$i",$cont{$_}) if($cont{$_} && $_ ne $cont{$_}); } if(defined(&threads::list)){my @l=threads->list;for(@l){$_->join}} for(sort keys %news){unshift @news0,delete($news{$_})} open(FF,fn(1,"$pub/$xmlname.tmp.xml$enc")) or die "$! ".fn(1,"$pub/$xmlname.tmp.xml$enc"); print FF qq( $title $url Open Source News); my @fdelta=(); my $cnt=$#news0; print "CNT: $cnt, $cnt0\n"; for(@news0){ $cnt--; if((!$rotate)||$nh{nkey($_)}){ my ($t,$s,$x,$d)=(dehtml($_->{title}),dehtml($_->{description}),dehtml($_->{xlink})); if(substr($_->{link},0,23) eq 'http://news.google.com/'){ $s=~s/
      .*?(.*?)<\/font>.*?<\/font>
      (.*?)<\/font>.*?<\/table>.*/for(my $i=length($1);$i>=0;$i--){if(substr($t,-$i) eq substr($1,0,$i)){substr($t,-$i)='';last}};$d=$1;"$1<\/i> $2"/gse; $d=~s/\ \;\-//gs; $x&&add_cont($x,dehtml($d)); } if($lang_web{$_->{lang}}||!exists($_->{lang})){ $s=($lang_web{$_->{lang}}==2?"[$_->{lang}] ":'').' '.''.url($_->{'link'},$t)." $s"; $fh.="
    • ".(exists($_->{'xlink'})?url(unesc($x),'link').' ':'').$s; push @fdelta,qm($s) if($cnt>=$cnt0); } print FF "\n"; for my $tag(keys %{$_}){print FF "<$tag>$_->{$tag}"} print FF ""; } if($cnt==$cnt0_){ $tstamp=time; $fh.='
      '; @fdelta[$#fdelta].='
      ' if($cnt>$cnt0); } } print FF "\n"; close FF; my $cont_="
    • Contributors:
        ".cur_cont($fh)."
      "; $fh.="\n
      \n$cont_\n
    $ad1$ad2"; open(FH,fn(1,"$pub/$htmlname.tmp.html$enc")) or die "$! $pub/$htmlname.tmp.html"; print FH $fh; close(FH); if($#fdelta>=0){ $counter++; open FH,fn(1,"$web/$deltaname$counter.js$enc"); print FH "d(["; my $c=''; for(@fdelta){print FH "$c'".$_."'";$c=','}; print FH "])"; close(FH); for(my $i=$counter+1;unlink "$web/$deltaname$i.js$enc";$i++){} my $fj; for(1..$counter){$fj="$fj"} $fj=qq($heads $fj $ad1 .); #
    $ad2

    . open(FH,fn(1,"$web/index-j.html$enc")); print FH $fj; close(FH); } close(FH); mv("$pub/$htmlname.tmp.html$enc","$pub/$htmlname.html$enc"); mv("$pub/$xmlname.tmp.xml$enc","$pub/$xmlname.xml$enc"); #if($rotate){ unlink("$pub/time"); open(FT,">$pub/time"); print FT join("\n",$t0,$time0,$counter,$tstamp); close FT; #} __END__ License: Anarchy. Все стихийные (включая социальные (включая юридические, моральные и т.д.)) аспекты существования и использования данного кода являются форс-мажорными обстоятельствами и автора не интересуют. (c) mahatma, 29.09.2006