#!/usr/bin/perl print <<EOF; <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n\n <title>Untitled Document</title> </head> <body> EOF use CGI::Carp "fatalsToBrowser"; use strict; use utf8; use HTML::Entities; # loading a site from www #my $string=lade_seite("http://www.livebet.gr/"); #my $string=lade_seite("http://www.imerisia.gr"); my $string=lade_seite("http://www.hidemail.de/blog"); #my $string=lade_seite("http://www.wsws.org/tr/2005/nov2005/turk-o16.shtml"); #my $string=lade_seite("http://www.hi.is/"); #my $string=lade_seite("http://www.stern.de/"); decode_entities($string); # we want only lower-case $string=lc($string); # a is nothing more than a space for us $string=~ s/\ \;/ /g; #deleting <script>-areas $string=~ s/<script.*?\/script>/\n/gis; # getting the title from the page $string =~ s/<title.*?>(.*?)<\/title.*?>//s; my $title=$1; $title="No Title" if $title eq ''; $string=~ s/.*?<body.*?>//gis; # cut from the head to the body, we want only the content print "Title $title<br>"; # delete the html-tags $string =~ s{ < (?: [^>'"] * | ".*?" | '.*?' ) + > }{ }gsx; # decode the html-entities -> make ä from ä and so on # replace signs like . with \n $string=~ tr/ \(\)\.\-\{\}\[\]\?\`\´\'\+\*\#\_\:\;\,\|\<\>\!\"\§\$\%\&\/\\\t\r\©\€\µ\”\“\»\=\@\µ/\n/; # delete x-\n's, we need only one \n $string=~ s/\n+/\n/gs; utf8::encode($string); my %woerter=(); #count words longer than 1 character grep {$woerter{$_}++ if length($_) >1} split(/\n/,$string); # and show foreach (sort(keys %woerter)){print "$_: $woerter{$_} mal<br>\n";} open (out,">test.txt") || die ("fehler"); #binmode(out, ":utf8"); foreach (sort(keys %woerter)){print out "$_: $woerter{$_} mal<br>\n";} close out; ####################### #lädt eine Seite aus dem Internet #################################### sub lade_seite{ use LWP::UserAgent; use HTTP::Request; use Encode; my ($url) = @_; print $url; my $content; my $encoding; my $request = HTTP::Request->new(GET => $url); my $ua = LWP::UserAgent->new; $ua->agent('User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.4) Gecko/20070515 Firefox/2.0.0.4'); my $response = $ua->request($request); if($response->is_success) { $content = $response->content; $content=~ /charset\=(.*?)\"/; my $charset=$1; #print "<br>Charset: $charset<br>"; $charset="utf8" if $charset eq '' || $charset eq 'UTF-8'; $content=decode($charset,$content); return $content; # Content zurück } else { return ; } } ################################################################ #entfernt aus einer Liste doppelte einträge ################################################################ sub del_double{ my %all=(); @all{@_}=1; return (keys %all); } |