You are here iC Home > Perl > XML with Perl > RSS 2.0

Perl

5.1 RSS 2.0

20.11.2008
5 XML with Perl [  up  ] - [ a - z ] - [ search PC ] - [ top ] 5.2 XML RPC

rss_2_0_simple.pl

Download: → /perl/ files/rss 2 0 simple pl.txt
#!/usr/bin/perl -w
use strict;
$| = 1;
################################
# rss_2_0.pl
################################

use XML::GDOME;

# -- GLOBAL
my $DEBUG = 0;
my $doc = XML::GDOME->createDocFromURI($ARGV[0]);
my @nodes = $doc->findnodes("//*");
my $do = my $i = 0;

foreach my $node (@nodes) {
        my @childs = $node->childNodes;

        foreach my $child (@childs) {
                if($child->nodeType == ELEMENT_NODE) {
                        my $data = defined $child->firstChild() ? 
                               $child->firstChild()->data : 'NULL';
			if ($child->nodeName eq 'title') {
				$i++;
				$do = 1 if $i > 1;
			}
			next unless $do;
			chomp $data;

                        print "[DEBUG] node = '",  $child->nodeName, "'\n" if $DEBUG;
			if ($child->nodeName eq 'title') {
				print '<b>', $data, '</b>', "\n";
			}
			elsif ($child->nodeName eq 'link') {
				print '<a href="', $data, '">', $data, '"</a>', "\n";
			}
			elsif ($child->nodeName eq 'description') {
				print $data, '<br />', "\n";
			}
			elsif ($child->nodeName eq 'pubDate') {
				print '<i>(', $data, ')<br /></i>';
				print "<hr noshade>\n\n";
			}
			else {
				print $data, '<br />';
			}
                }
        }
}

__END__


rss_2_0.pl

Download: → /perl/ files/rss 2 0 pl.txt
#!/usr/bin/perl -w
use strict;
$| = 1;
################################
# rss_2_0.pl
#
# URL: http://www.infocopter.com/perl/rss.html
# Location: quartus
#
# Usage:
# rss_2_0.pl [Options] rssinput.xml "3" >output_top3.html
#
my $VERSION = '0.05.02';
################################

use XML::GDOME;
use Unicode::String qw(latin1 utf8); # For Umlaute problems -> web_enc / toISO
use Getopt::Long;

# -- GLOBAL
my $DEBUG = 0;
my $do = my $i = 0;
my %META = ();
# -- Don't make a target _blank to this domains:
my @HOME_DOMAINS = qw(infocopter.com pgate.net);

my @getopt_args = (
	'd',		# debug
	'debug' ,	# debug mode for development support
	'omit_tags=s' ,	# e.g. img,a,h1
	'h',		# help
	'proxy=s',	# proxy host
	'v',		# Verbose mode
);

my %Options;
Getopt::Long::config("noignorecase", "bundling");

print "\n";
&Usage() unless GetOptions(\%Options, @getopt_args);

if ($Options{'d'} or $Options{'debug'}) {
	foreach (keys %Options) { print "- $_ = \"$Options{$_}\"\n"; }
}
$Options{'omit_tags'} ||= '';

my $doc = XML::GDOME->createDocFromURI($ARGV[0] || '/var/www/html/rss/xml/linuxjournal.xml');
my @nodes = $doc->findnodes("//*");

&main();

###############################################
sub main() {
###############################################
	my @row_color = ();
	   $row_color[0] = 'white';
	   $row_color[1] = '#e9e9e9';

	#print '<table cellspacing="0" cellpadding="3" border="0" style="border-color:black;">';

	print '<tr><td>', scalar localtime, '<br /></td></tr>';

	my $i = 0;
	my $max = $ARGV[1] || 0;
	foreach (&parseRSS()) {
		my $hash_ref = $_;
		next if $hash_ref->{'title'} =~ /^ADV:/; # Skip advertisting

		last if (++$i > $max) && $max;

		(my $description = $hash_ref->{'description'}) =~ s/^<p>//gi;
		    $description =~ s/<\/p>$//gi;
		    $description =~ s/<\/p>/<br \/><br \/>/gi;
		    $description =~ s/<p>//gi;
		    $description =~ s/’/'/g;
		    $description =~ s/é/\&eacute;/g;
		    $description =~ s/è/\&egrave;/g;
		    if (&isISO($description)) {
		    	$description = &web_enc($description);
		    }
		    else {
		    	$description = &web_enc(&toISO($description));
		    }
		my @omit_tag_arr = split /,/, $Options{'omit_tags'};
		foreach (@omit_tag_arr) {
			$description =~ s/\<$_/\&lt;$_/g;
		}

		if ($hash_ref->{'__nodeName'} !~ /description/) {
			# Something else, probably the disclaimer section but to be generic we catch all ;-)
			print	'<tr bgcolor="white">',
				'<td style="border-bottom-style:dashed; ',
				  'border-bottom-color:#cccccc; border-bottom-width:1px;">';
			if (defined $hash_ref->{'url'}) {
				print '<a target="_blank" href="', $hash_ref->{'link'}, '">',
					'<img border="0" src="', $hash_ref->{'url'}, '" alt="',
					$hash_ref->{'title'}, '"></a>';
			}
			else {
				foreach (keys %{$hash_ref}) {
					print "$_ = ", $hash_ref->{$_}, '<br />';
				}
			}
			print '</td></tr>';
			$i--;
			next;
		}
		else {
			print '<tr bgcolor="', $row_color[$i % 2],
				'"><td style="border-bottom-style:dashed; ',
					'border-bottom-color:#cccccc; border-bottom-width:1px;">';
		}

		my $title = $hash_ref->{'title'};
		   $title =~ s/’/'/g;
		   $title =~ s/é/\&eacute;/g;
		   $title =~ s/è/\&egrave;/g;
		if (&isISO($title)) {
			$title = &web_enc($title);
		}
		else {
			$title = &web_enc(&toISO($title));
		}

		print '<span style="font-size:16px;font-weight:bold">', $title, '</span><br />',
			'<span style="font-size:10px">', $hash_ref->{'pubDate'}, '</span><br />',
			$description, '<br /> <br />', "\n";

		if ($hash_ref->{'category'}) {
			print '&rsaquo; <a href="', $hash_ref->{"category\tdomain"}, '">',
				$hash_ref->{'category'}, '</a> ';
		}

		(my $link_disp = $hash_ref->{'link'}) =~ s/^https?:\/\///i;
		    $link_disp =~ s/^www\.//i;
		if (length($link_disp) > 60) {
			$link_disp = substr($link_disp, 0, 60) . '...';
		}

		my $target = ' target="_blank"'; # default linking
		my $uarr = ' &uarr;';
		foreach (@HOME_DOMAINS) {
			if ($hash_ref->{'link'} =~ /$_/) {
				$target = $uarr = ''; last;
			}
		}
		print "&rsaquo; <a$target href=\"", $hash_ref->{'link'}, '">', $link_disp, "</a>$uarr<br />";
		print '</td></tr>', "\n\n";
	}

	my $meta_row = '';
	if (defined $META{'copyright'}) {
		$meta_row = $META{'copyright'};
	}
	if (defined $META{'managingEditor'}) {
		$meta_row .= "<br><a href=\"mailto:$META{'managingEditor'}\">Managing Editor</a>";
	}

	if ($meta_row) {
		print '<tr bgcolor="white"><td style="border-bottom-style:dashed; border-width:0;">';
		# foreach (keys %META) { print "$_ = $META{$_}<br />"; }
		print '<br /><i>', $meta_row, '</i>';
		print '</td></tr>';
	}

	#print '</table>';
}

###############################################
sub parseRSS() {
###############################################
	my @results = ();
	foreach my $node (@nodes) {
        	my @childs = $node->childNodes;

		my %hash = ();

		foreach my $child (@childs) {
	                next unless $child->nodeType == ELEMENT_NODE;

			my $data = defined $child->firstChild() ? 
        	                       $child->firstChild()->data : 'NULL';
			$META{$child->nodeName} = $data;
			if ($child->nodeName eq 'title') {
				$i++;
				$do = 1 if $i > 1;
			}
			next unless $do;
			chomp $data;

                        print "[DEBUG] node = '",  $child->nodeName, "'\n" if $DEBUG;
			$hash{$child->nodeName} = $data;
			$hash{'__nodeName'} .= $child->nodeName . '<br />';

			if (defined $child->getAttributeNode("domain")) {
				# -- <category domain="http://...
				my $href = $child->getAttributeNode("domain");
				$hash{$child->nodeName . "\tdomain"} = $href->getValue();
			}
        	}
		# -- Put an atomic RSS entry to the array and empty the hash
		my %foohash = %hash;
		$foohash{'pubDate'} ||= ''; # * No date *
		$foohash{'title'  } ||= '* No title *';
		$foohash{'description'} ||= '* No description *';

		$foohash{'category'} ||= ''; # empty!
		my $categ_url = $foohash{'category'} ?
			'http://groups.google.ch/groups?hl=de&q=' . $foohash{'category'} : '';
		$foohash{"category\tdomain"} ||= $categ_url;
		push(@results, \%foohash) if %hash;
		%hash = ();
	}
	@results;
}

sub web_enc ($) {
	my $enc = '';
	for (my $i = 0; $i < length($_[0]); $i++) {
		my $ordno = ord substr($_[0], $i, 1);
		$enc .= $ordno > 127 ? sprintf("&#%d;", $ordno) : substr($_[0], $i, 1);
	}

	$enc =~ s/ $//;;
	$enc;
}

sub toISO($) {
        my $text = $_[0];
        # if this host was UTF-8 encoded:
        my $text_iso  = (utf8($text))->latin1;
        my $text_utf8 = (latin1($text_iso))->utf8; # reverse check

        if ($text ne $text_utf8) {
                #print STDERR "Unequal reverse check! It seems your input data \"$text\" is ",
                #        "ISO encoded already, so you don't need the latin1 encoding stuff here!\n";
                return $text;
        }
        $text_iso;
}

sub isISO ($) {
	my $text = $_[0];
	local $SIG{'__WARN__'} = \&alarm_handler; # install signal handler
	my $text_iso  = (utf8($text))->latin1;
	my $text_utf8 = (latin1($text_iso))->utf8; # reverse check
	return $text eq $text_utf8 ? 0 : 1;
}

sub alarm_handler () {
	#print STDERR "alarm catched!\n";
	return;
}

sub Usage() {
	print "Wrong usage.\n";
	exit(0);
}

__END__



Advanced search tips
5 XML with Perl [  up  ] - [ top ] 5.2 XML RPC



[ home ] - [ search ] - [ feedback ]

copyright by reto - created with mytexi