yahtv.pl
- #!/usr/bin/perl
- #
- # Title: yahtv.pl
- #
- # Date: 09/28/09
- #
- # Creator: Mike Martinet, southrustern.com
- #
- # This software is free to use, modify or give away. I ask only that my
- # name and URL be included with any distribution. I accept no liability
- # for any misuse, infringement or damage caused by the running of this script.
- #
- # The current version of this script is available at:
- # http://www.southrustern.com/code/yahtv.html
- #
- # Overview:
- #
- # Mythtv is a software system which will allow a computer equipped with a TV
- # capture card to be used as a DVR (Digital Video Recorder).
- #
- # Copies of Mythtv are available at:
- # http://www.mythtv.org/
- #
- # To get full use out of Mythtv, it is necessary to subscribe to an EPG (Electronic
- # Program Guide). Subscription services (as of this date) cost less than $2US/month.
- #
- # More info:
- # http://www.mythtv.org/wiki/Schedules_Direct
- # http://www.mythtv.org/wiki/DataDirect
- #
- # Subscription service provided by Schedules Direct at:
- # http://www.schedulesdirect.org/
- #
- # By all means, if you're in the least hesitant to run this kludge, please support
- # Schedules Direct instead.
- #
- # This script was NOT written because subscription services are too expensive!
- # Rather, this script was written because there's no fun or challenge in simply
- # handing over $20 a year.
- #
- # This script will not supply descriptions for the listings - yet another good
- # reason to just pay the twenty bucks.
- #
- # Description:
- #
- # Yahtv.pl queries a Yahoo Pipe (which I wrote) for television listings. It parses
- # the returned XML file and writes an XMLTV standard XML file which can be used to
- # update mythtv program listings.
- #
- # The Yahoo Pipe is located here:
- #
- # http://pipes.yahoo.com/pipes/pipe.info?_id=294b9e661664ed7e47874fbbb4c44f7d
- #
- # Usage: (On Linux systems)
- #
- # You might want to change the "PROVIDER" variable below to the six-digit code
- # that Yahoo uses to identify your particular television viewing location.
- #
- # This script can be run from the command line with or without arguments.
- # Without arguments, it will first look to see if an earlier file exists.
- # If so, it will call the Yahoo Pipe for a day later than the existing
- # file. If no existing file is found, it will call the Pipe with a date
- # twelve days in the future. That's because on any given day, the Yahoo
- # Listings are twelve days out. Once you get this set up and you have a
- # complete set of listings pulled into Mythtv, you can keep up-to-date
- # either by running yahtv by hand and updating Mythtv, or wrapping it
- # up in a shell script. (I may do this and will post the shell script
- # if so)
- #
- # This script can be run with arguments to capture the data from a specific
- # day, as in, yahtv.pl 10 1, which will get the day's listings for October
- # first of the current year.
- #
- # The script calls the Pipe eight times in three hour chunks each, starting
- # at midnight of the day requested. This is because Yahoo TV Listings will
- # only display three hours of programming at a time.
- #
- # The script will attempt to write its output to $HOME/xmltv. If the
- # directory doesn't exist, the script will attempt to write to the current
- # directory.
- #
- # The resultant XML file will be named as xmltv.YYYYMMDD.xml. This file can
- # then be fed to mythfilldatabase like so:
- #
- # mythfilldatabase --file 1 xmltv.YYYYMMDD.xml
- #
- # where "1" is the number of the capture card associated with the TV source.
- #
- # Updates:
- #
- # Someday, Yahoo will change the layout of its listing pages. At that
- # time, this script will most likely fail. It remains to be seen if I will
- # spend the time to update it, or start handing over the $20 as mentioned above.
- #
- # Disclaimer:
- #
- # It's quite possible that Yahoo may not like its TV listings being "scraped".
- # However, these things are true today:
- #
- # 1 - You can't (in the US, at least) copyright facts (Telephone numbers, sports
- # scores, train schedules, etc)
- # 2 - Yahoo's TV Listings require no subscription and are available to the public.
- # 3 - There is no robot rule forbidding "crawling" of the Yahoo TV listings.
- # 4 - Yahoo does not specifically forbid its Pipes from accessing its Listings.
- # 5 - Yahoo's Terms of Service (http://info.yahoo.com/legal/us/yahoo/utos/utos-173.html)
- # includes no proscription against the private use of its content in this manner
- use strict;
- use LWP::Simple;
- use URI::Escape;
- use Time::Local;
#-------------------------------------------------------------------------------------------#
# Main script.
#
# Works out which day to fetch, opens the output file, calls getListings() for
# each of the eight three-hour blocks of that day, and closes the XMLTV document.
#
# The bareword handles FH (XMLTV output) and FH1 (debug copy of the raw XML) are
# kept as globals because getListings() prints to FH by name.
#-------------------------------------------------------------------------------------------#

# Optional command-line arguments: month and day (1-based) of the day to fetch
my( $inMon, $inDay ) = @ARGV;

my $DEBUG = 0;

# Set this to 1 to write a channel listing at the beginning of the XML
# This should only need to be done once, and only if you don't already
# have a valid channel list
my $GETCHANS = 0;

# Six-digit code that Yahoo uses to identify the television viewing location
my $PROVIDER = "330500";

# Write to $HOME/xmltv when it exists, otherwise to the current directory
my $home = $ENV{ 'HOME' };
my $OUTDIR = defined( $home ) ? "$home/xmltv" : ".";
if( !( -d $OUTDIR ) )
{ $OUTDIR = "."; }

if( $DEBUG > 0 )
{ print "OUTDIR = $OUTDIR\n"; }

# Three hours in unix timestamp
my $THREEH = 10800;
my $DAY = 86400;

# Calendar date to start from (1-based month, four-digit year) and how many
# days to move forward from it
my( $year, $mon, $mday );
my $daysAhead = 0;

# No arguments?
if( !defined( $inDay ) || !defined( $inMon ) )
{
    # Collect the date part (YYYYMMDD) of every previous output file.
    # readdir is used instead of a glob so a directory name containing
    # spaces cannot split the pattern.
    my @dates;
    if( opendir( my $dh, $OUTDIR ) )
    {
        @dates = sort map { /^xmltv\.(\d{8})\.xml$/ ? $1 : () } readdir( $dh );
        closedir( $dh );
    }

    # Pick the last (most recent) date off the array
    my $latest = pop( @dates );

    if( !defined( $latest ) )
    {
        # No previous files: start from today and jump ahead.
        # NOTE(review): the file header says "twelve days"; the original code
        # multiplied the day length by 11, and that value is kept here.
        ( $mday, $mon, $year ) = ( localtime( time() ) )[ 3, 4, 5 ];
        $mon  += 1;
        $year += 1900;
        $daysAhead = 11;
    }
    else
    {
        # Break the date string down into atoms and fetch the following day
        ( $year, $mon, $mday ) = ( $latest =~ /(\d{4})(\d{2})(\d{2})/ );
        $daysAhead = 1;
    }
}
else
{
    if( $inMon !~ /^\d{1,2}$/ || $inDay !~ /^\d{1,2}$/ )
    { die "Usage: yahtv.pl [month day]\n"; }

    # Explicit month/day always refers to the current year
    ( $mon, $mday ) = ( $inMon, $inDay );
    $year = ( localtime( time() ) )[ 5 ] + 1900;
}

# Do the day arithmetic from local noon: adding whole days to local midnight
# can land on the wrong calendar day when a DST change falls in between.
my $noon = eval { timelocal( 0, 0, 12, $mday, $mon - 1, $year ) };
if( !defined( $noon ) )
{ die "yahtv: invalid date $mon/$mday/$year\n"; }

my( $tDay, $tMon, $tYear ) = ( localtime( $noon + ( $daysAhead * $DAY ) ) )[ 3, 4, 5 ];

# Midnight (local time) of the day to fetch, as a unix timestamp
my $sTimeDay = timelocal( 0, 0, 0, $tDay, $tMon, $tYear + 1900 );

# Get time atoms for the timestamp
my( $hour, $fMday, $fMon, $fYear ) = ( getLTime( $sTimeDay ) )[ 2, 3, 4, 5 ];
my $fileDate = $fYear.$fMon.$fMday;
my $OUTFILE = "$OUTDIR/xmltv.$fileDate.xml";
my $from = $hour;

print "\n";
print "Start date = ".localtime( $sTimeDay )."\n";
print "sTimeDay = $sTimeDay\n";
print "\n";

# Remove a stale debug copy. The listing file itself is opened for
# truncating write below, so an earlier run's content is replaced rather
# than appended to.
my $rawFile = "$OUTDIR/tmp.xml";
if( -e $rawFile )
{ unlink( $rawFile ) or warn "yahtv: cannot remove $rawFile: $!\n"; }

open( FH, '>', $OUTFILE ) or die "yahtv: cannot write $OUTFILE: $!\n";
print FH "<?xml version='1.0' encoding='UTF-8'?>\n";
print FH "<!DOCTYPE tv SYSTEM 'xmltv.dtd'>\n";
print FH "<tv generator-info-name='yahtv'>\n";

# Opens a file for a copy of the raw XML from yahoo pipes
if( $DEBUG > 0 )
{ open( FH1, '>>', $rawFile ) or warn "yahtv: cannot write $rawFile: $!\n"; }

# Eight three-hour blocks cover the whole day
for my $idx ( 0 .. 7 )
{
    printf( "%s %02d-%02d\n", "Getting", $from, ( $from + 3 ) );
    getListings( $sTimeDay, *FH1 );
    $sTimeDay += $THREEH;
    $from += 3;
}

print FH "</tv>\n";
# Buffered write errors only surface at close
close( FH ) or die "yahtv: error closing $OUTFILE: $!\n";

if( $DEBUG > 0 )
{ close( FH1 ); }

print "\n";
# End
- #
- #-------------------------------------------------------------------------------------------#
- #-------------------------------------------------------------------------------------------#
# Fetches one three-hour block of listings through the Yahoo Pipe and appends
# it, XMLTV formatted, to the global output filehandle FH.
#
# Arguments:
#   $startDay - unix timestamp sent to Yahoo as starttime_day
#   $FH1      - optional filehandle; when $DEBUG > 0 and the handle is open,
#               the raw XML returned by the Pipe is copied to it
#
# Reads the file-level settings $PROVIDER, $DEBUG and $GETCHANS, and prints to
# FH, which the caller must already have opened. Returns nothing. When the
# Pipe returns no data a warning is issued and nothing is written.
sub getListings
#-------------------------------------------------------------------------------------------#
{
    my( $startDay, $FH1 ) = @_;

    my $query = "http://tv.yahoo.com/listings?gridtype=full&starttime_day=$startDay";
    $query .= "&provider=$PROVIDER";

    if( $DEBUG > 0 )
    { print "query = $query\n"; }

    my $pipeUrl = "http://pipes.yahoo.com/pipes/pipe.run?TimeSlot=".uri_escape( $query );
    $pipeUrl .= "&_id=294b9e661664ed7e47874fbbb4c44f7d&_render=rss";

    if( $DEBUG > 0 )
    { print "$pipeUrl\n"; }

    # LWP::Simple's get() returns undef when the request fails
    my $xml = get( $pipeUrl );
    if( !defined( $xml ) )
    {
        warn "getListings: no data returned for start time $startDay\n";
        return;
    }

    # Keep a copy of the raw XML for debugging, if the caller gave us an open handle
    if( $DEBUG > 0 && defined( $FH1 ) && defined( eval { fileno( $FH1 ) } ) )
    { print {$FH1} $xml; }

    # Remove the xml wrappers from the data, keeping only the <title> lines
    # that are not Yahoo's own feed titles
    my @titles;
    foreach my $line ( split( "\n", $xml ) )
    {
        next unless( $line =~ /<title\>/ && $line !~ /Yahoo/ );
        $line =~ s/<title\>|<\/title\>//g;
        push( @titles, $line );
    }

    # Group the lines into listings using dur_ as a delimiter. A listing
    # starts at a line containing dur_ and runs until the next such line, an
    # empty line, or the end of the data. The trailing "" is a sentinel that
    # flushes the final listing. An empty line only ends the current listing;
    # it no longer stops the processing of everything after it.
    my @dbArr;
    my $record;
    foreach my $line ( @titles, "" )
    {
        if( $line eq "" || $line =~ /dur_/ )
        {
            if( defined( $record ) )
            {
                $record =~ s/\s+/ /g;
                # Only complete listings carry a start_ marker
                if( $record =~ /start_/ )
                { push( @dbArr, $record ); }
            }
            $record = ( $line eq "" ) ? undef : $line;
        }
        elsif( defined( $record ) )
        {
            # Append the data to the current listing
            $record .= $line;
        }
    }

    # Split every listing into four whitespace separated tokens and whatever
    # is left over (the title)
    my @listings;
    foreach my $entry ( @dbArr )
    {
        my @fields = ( $entry =~ /^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s*(.*)/ );
        next unless( @fields );

        # Strip the trailing underscore from all items except the title.
        # NOTE(review): the body of this loop was missing from the source and
        # has been reconstructed from its comment - confirm against real Pipe output.
        foreach( @fields[ 0 .. 3 ] )
        { s/_$//; }

        push( @listings, \@fields );
    }

    # Channel scan
    if( $GETCHANS > 0 )
    {
        my $lastChan = "";
        foreach my $item ( @listings )
        {
            my( $chan, $sign ) = @{ $item }[ 2, 3 ];
            next if( $chan eq $lastChan );

            print FH " <channel id='$chan'>\n";
            print FH " <display-name lang='en'>$sign</display-name>\n";
            print FH " </channel>\n";
            $lastChan = $chan;
        }
    }

    # XML format the listings to xmltv.dtd, mythtv style
    foreach my $item ( @listings )
    {
        my( $dur, $start, $chan, $sign, $title ) = @{ $item };

        # Duration is multiplied by 60 and added to the start timestamp to get the stop time
        my $stop = ( $dur * 60 ) + $start;

        # YYYYMMDDhhmm00, local time
        my $xStart = join( "", ( getLTime( $start ) )[ 5, 4, 3, 2, 1 ] )."00";
        my $xEnd   = join( "", ( getLTime( $stop ) )[ 5, 4, 3, 2, 1 ] )."00";

        if( $DEBUG > 0 )
        {
            print "\ndur = $dur\n";
            print "xStart = $xStart\n";
            print "xEnd = $xEnd\n";
            print "chan = $chan\n";
            print "sign = $sign\n";
            print "title = $title\n";
        }

        # Finally. Print the listing to the XML file. The title is written
        # verbatim: it was lifted out of XML and so is already XML-escaped.
        print FH " <programme channel='".$chan."' start='$xStart' stop='$xEnd'>\n";
        print FH " <title lang='en'>'$title'</title>\n";
        print FH " </programme>\n";
    }

    return;
}
- #-------------------------------------------------------------------------------------------#
- #-------------------------------------------------------------------------------------------#
# Returns formatted time atoms from an input unix timestamp.
#
# Arguments:
#   $inTime - seconds since the epoch, broken down with localtime()
#
# Returns the nine element list
#   ( $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst )
# in localtime() order, where $mon is 1-based, $year is the full year, and
# $sec, $min, $hour, $mday, $mon and $wday are zero-padded to two digits.
# $yday and $isdst are passed through unchanged.
#-------------------------------------------------------------------------------------------#
sub getLTime
#-------------------------------------------------------------------------------------------#
{
    # The timestamp has to be unpacked from @_; without this declaration
    # "use strict" rejects $inTime at compile time.
    my( $inTime ) = @_;

    my( $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst ) = localtime( $inTime );

    # localtime() months are 0-based and years are offsets from 1900
    $mon  += 1;
    $year += 1900;

    foreach( $sec, $min, $hour, $mday, $mon, $wday )
    { $_ = sprintf( "%02d", $_ ); }

    return( $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst );
}
- #-------------------------------------------------------------------------------------------#