#!/usr/bin/perl -w

## Copyright 2007 Andy Kelk (weather@mopoke.co.uk)
## Released under the Perl Artistic License
## see http://www.perl.com/pub/a/language/misc/Artistic.html

## +---------------------------------------------------------------------------+
## Overview
## This script parses weather data from the BBC Weather website. It then
## produces a 5-day RSS feed for that data.
##
## +---------------------------------------------------------------------------+
## Notes
## This script doesn't use CGI.pm or LWP as they proved to be a bit expensive on
## memory for the limited use made of them. Especially on the poor old server this
## ran on. Therefore, the query parsing is fragile, as is the fetching of data
## from the BBC. Also, the code looks a bit hacky.
##
## The data is scraped from an HTML page. As soon as the BBC update the layout
## of their page, there is a strong possibility this will break. Fixing it is left
## as an exercise for the reader.
##
## +---------------------------------------------------------------------------+
## Installation
## Requires the following modules from CPAN (and their associated dependencies):
##  * XML::RSS
##  * XML::Atom::SimpleFeed
##  * Date::Format
##  * IO::Socket::INET
##
## Then place this script in your cgi-bin directory, setting the appropriate
## permissions.
## The script also expects a cache directory that it can write to. (see the
## $datadir variable, below, to change the location of this.
##
## +---------------------------------------------------------------------------+
## Usage
## Called from your website e.g.:
## /cgi-bin/weather2rss.pl?location=2187&f=1&w=0&format=rss&d=0
##
## location : location ID from the BBC website. E.g. 2187 is Chigwell, UK
## f        : fahrenheit flag. If true, the output is in Fahrenheit.
## w        : world flag. If true, the ID is passed as "world" to the BBC (2187
##            is then Ganzhou, China).
## format   : rss or atom. Defaults to rss.
## d        : detailed flag. If true, the output contains more detail.
##
## +---------------------------------------------------------------------------+
## Apache Rewrite
## This was designed to be run via Apache using a rewrite rule so that links
## such as /weather/2187f.xml would be equivalent to the query string shown
## above in Usage.
##
## Here is a sample rewrite rule to do that:
## RewriteRule ^/weather/(atom)?/?(world)?/?([0-9]+)(f)?\.xml$ /cgi-bin/weather2rss.pl?location=$3&f=$4&w=$2&format=$1 [PT,L]

package weather2rss;
use strict;
use XML::RSS;
use XML::Atom::SimpleFeed;
use Date::Format;
use IO::Socket::INET;

## +---------------------------------------------------------------------------+
## Change these if you want!
our $hours = 3; ## How long do we cache the xml file for?
our $datadir = "/var/www/weatherdata/"; ## Where do we cache the data?
## +---------------------------------------------------------------------------+

main();

sub main {
    ## save memory by not calling CGI.pm - params are simple and we're coming
    ## from a URL rewrite anyway so we know what params are incoming...
    my @params = split(/&/,$ENV{QUERY_STRING});
    my %params;
    foreach (@params) {
        my ($key,$value) = split(/=/,$_);
        $params{$key} = $value;
    }
    $params{$_} ||= "" foreach qw(location f w format d);
    my $locationid = $params{'location'};
    $locationid =~ s/[^0-9]//g;
    failure("Invalid location supplied.") unless $locationid;

    my $fahrenheitflag = $params{'f'} eq "f" ? 1 : 0;
    my $worldflag = $params{'w'} eq "world" ? 1 : 0;
    my $format = $params{'format'} eq "atom" ? "atom" : "rss";
    my $detailed = $params{'d'} eq "detailed" ? 1 : 0;

    get_cache($locationid,$fahrenheitflag,$worldflag,$format,$detailed);

    my ($html,$url) = get_content($locationid,$fahrenheitflag,$worldflag);

    my $content = parse_content($html,$url,$fahrenheitflag,$detailed);

    my $xml = $format eq "atom" ? build_atom($content) : build_rss($content);

    print "Content-type: text/xml\n\n";
    print $xml;

    set_cache($locationid,$xml,$fahrenheitflag,$worldflag,$format,$detailed);
}

sub failure {
    my ($msg) = @_;
    print "Content-type: text/html\n\n";
    print "<html><body>Could not retrieve source page from BBC. $msg</body></html>";
    exit(0);
}

sub get_cache {
    my ($locationid,$fahrenheitflag,$worldflag,$format,$detailed) = @_;
    my $cachefile = $datadir .
        ($format eq "atom" ? "atom_":"") .
        ($detailed?"detail_":"") .
        ($worldflag?"world":"") .
        "$locationid" .
        ($fahrenheitflag?"f":"") .
        ".xml";
    if (-e $cachefile) {
        my @stat = stat($cachefile);
        if (time - $stat[9] < ($hours*60*60)) {
            if (open XMLFILE, $cachefile) {
                print "Content-type: text/xml\n\n";
                print $_ while (<XMLFILE>);
                close XMLFILE;
                exit(0);
            }
            else {
                return;
            }
        }
    }
}

sub set_cache {
    my ($locationid,$content,$fahrenheitflag,$worldflag,$format,$detailed) = @_;
    if (
        open XMLFILE, ">" . $datadir .
        ($format eq "atom" ? "atom_":"") .
        ($detailed?"detail_":"") .
        ($worldflag?"world":"") .
        "$locationid" .
        ($fahrenheitflag?"f":"") .
        ".xml"
    ) {
        print XMLFILE $content;
        close XMLFILE;
    }
}

sub get_content {
    my ($locationid,$fahrenheitflag,$worldflag) = @_;
    my $host = 'www.bbc.co.uk';
    my $baseurl = "/weather/5day" .
        ($fahrenheitflag?"_f":"") .
        ".shtml?" .
        ($worldflag?"world":"id") .
        "=";
    my $url = "$baseurl$locationid";

    ## save memory by creating a Socket directly and not using LWP
    my $sock = new IO::Socket::INET (
                       PeerAddr => $host,
                       PeerPort => '80',
                       Proto => 'tcp',
                      );
    failure($@) unless $sock;

    print $sock "GET $url HTTP/1.0\n";
    print $sock "Host: $host\n\n";
    my $content;
        while (<$sock>) {
                $content .= $_;
        }
    close $sock;
    if ($content =~ /HTTP\/1.1 (\d+) (.+)/) {
        if ($1 eq "200") {
            return $content,"http://$host$url";
        }
        else {
            failure("$1 $2");
        }
    }
    else {
        failure("no response");
    }
}

sub parse_content {
    my ($html,$url,$fahrenheitflag,$detailed) = @_;
    my $content;

    ## Grab the location namem from the header
    if ($html =~ m!<td bgcolor="#\d{6}" width="\d+" height="\d+" colspan="\d+" class="var2"><h1 style="text-transform:capitalize;">([^<]*)</h1>!) {
        $content->{locationname} = $1;
    }
    my $time = time;
    $content->{url} = $url;
    $content->{title} = "BBC weather : " . $content->{locationname};
    $content->{description} = "BBC weather : " . $content->{locationname};
    $content->{date} = time2str("%Y-%m-%dT%H:%M+%S:00", $time);
    $content->{rights} = 'BBC Weather Centre in association with the Met Office';
    $content->{contact} = 'weather@mopoke.co.uk';
    $content->{subject} = "Weather";

    ## Yikes! This huge regexp matches the HTML for each item. Good luck
    ## debugging it if it breaks :-)
    ## (I used Komodo's RX Toolkit to build it - it's super duper)

    while ($html =~ m!<td bgcolor="[^"]*" scope="row" class="weatherday" height="[^"]*">(?:<a name="results"><img[^>]*></a>)?(?:<img[^>]*>)?<strong>(?:[^<]*)<br /><img[^>]*>(?:.*?)<td[^>]*><div onmouseover="showSym[^>]*><img .*? alt="(.*?)'s predominant weather is forecast to be ([^.]*)\."[^>]*>(?:.*?)<td[^>]*><div[^>]*><img[^>]*><span class="temptxt"><strong>((?:-|\d)+)<abbr title="Day Temperature(?:.*?)<td[^>]*><div[^>]*><img[^>]*><span class="temptxt"><strong>((?:-|\d)+)<abbr title="Night Temperature(?:.*?)<td[^>]*><div[^>]*><img src="[^"]*" width="50" height="55" alt="[^ ]* wind direction: (.*?) Wind."[^>]*><span class="windtxt"><strong><abbr title="Miles per hour">(\d+)</abbr>(?:.*?)<td[^>]*>&nbsp;<strong><abbr title="Visibility">([^<]*)</abbr>(?:.*?)<td[^>]*>&nbsp;<strong><abbr title="Pressure in Millibars">(\d+)</abbr>(?:.*?)<td[^>]*>&nbsp;<strong><abbr title="Relative humidity in percent">(\d+)</abbr>!sgi) {
        my $day      = $1;
        my $summary  = $2;
        my $max      = $3;
        my $min      = $4;
        my $winddir  = $5;
        my $windmph  = $6;
        my $visi     = $7;
        my $pressure = $8;
        my $humidity = $9;
        my $desc     = uc(substr($summary,0,1)) .
            substr($summary,1) .
            ". Max: $max" .
            ($fahrenheitflag?"F":"C") .
            ", Min: $min" .
            ($fahrenheitflag?"F":"C") .
            ".";
        if ($detailed) {
            $desc .= " Wind: $winddir, $windmph Mph.";
            $desc .= " Visibility: ".uc(substr($visi,0,1)).substr($visi,1).".";
            $desc .= " Pressure: $pressure mb.";
            $desc .= " Relative Humidity: $humidity%.";
        };
        push @{$content->{items}},
        {
          day => $day,
          link => $url,
          date => time2str("%Y-%m-%dT%H:%M+%S:00", $time),
          description => $desc
        };
    }
    return $content;
}

sub build_rss {
    my ($content) = @_;
    my $rss = new XML::RSS (version => '1.0');
    $rss->channel(
       title        => $content->{title},
       link         => $content->{url},
       description  => $content->{description},
       dc => {
         date       => $content->{date},
         subject    => $content->{subject},
         creator    => $content->{contact},
         publisher  => 'http://www.bbc.co.uk/weather',
         rights     => $content->{rights},
         language   => 'en-gb',
       },
       syn => {
         updatePeriod     => "daily",
         updateFrequency  => int(24/$hours),
         updateBase       => "1901-01-01T00:00+00:00",
       },
       taxo => [
         'http://dmoz.org/News/Weather'
       ]
     );

    foreach my $item (@{$content->{items}}) {
        $rss->add_item(
          title       => $item->{day},
          link        => $item->{link},
          dc => {
                date => $item->{date},
            },
          description => $item->{description}
        );
    }
    return $rss->as_string;
}

sub build_atom {
    my ($content) = @_;
    my $atom = XML::Atom::SimpleFeed->new(
        title     => $content->{title},
        link      => $content->{url},
        tagline   => $content->{description},
        copyright => $content->{rights},
        author    => {name => $content->{contact}, email => $content->{contact}}
    );

    if ($ENV{HTTP_HOST} =~ m!http://([^/]+)/?!) {
        $content->{hostpart} = $1;
    }
    else {
        $content->{hostpart} = 'www.mopoke.co.uk';
    }
    my $datepart = substr($content->{date},0,10);

    foreach my $item (@{$content->{items}}) {
        $atom->add_entry(
            title    => $item->{day},
            link     => $item->{link},
            content  => $item->{description},
            subject  => $content->{subject},
            id       => "tag:$content->{hostpart},$datepart:$item->{day}",
        );
    }
    return $atom->as_string();
}


