<?php
// $Id$

/**
 * @file
 * Parser functions for the aggregator module.
 */
/**
 * Implements hook_aggregator_parse_info().
 *
 * @return
 *   An array with the translated 'title' and 'description' of this parser.
 */
function aggregator_aggregator_parse_info() {
  $info = array();
  $info['title'] = t('Default parser');
  $info['description'] = t('Parses RSS, Atom and RDF feeds.');
  return $info;
}
/**
 * Implements hook_aggregator_parse().
 *
 * Passes the feed's source string to aggregator_parse_feed() and, when that
 * succeeds, copies the parsed channel data onto the feed object.
 *
 * @param $feed
 *   The feed object. Its source_string and http_headers properties are read;
 *   on success its link, description, image, etag and modified properties are
 *   set.
 *
 * @return
 *   TRUE if the feed was parsed, FALSE otherwise.
 */
function aggregator_aggregator_parse($feed) {
  global $channel, $image;

  // Parse the raw data; aggregator_parse_feed() resets and fills the
  // $channel and $image globals used below.
  if (!aggregator_parse_feed($feed->source_string, $feed)) {
    return FALSE;
  }

  $modified = empty($feed->http_headers['Last-Modified']) ? 0 : strtotime($feed->http_headers['Last-Modified']);
  $etag = empty($feed->http_headers['ETag']) ? '' : $feed->http_headers['ETag'];

  // Prepare the channel data.
  foreach ($channel as $key => $value) {
    $channel[$key] = trim($value);
  }

  // Prepare the image data (if any).
  foreach ($image as $key => $value) {
    $image[$key] = trim($value);
  }

  // Render the feed image as a linked image only when every part is present.
  if (!empty($image['link']) && !empty($image['url']) && !empty($image['title'])) {
    $image = l(theme('image', array('path' => $image['url'], 'alt' => $image['title'])), $image['link'], array('html' => TRUE));
  }
  else {
    $image = '';
  }

  // Add parsed data to the feed object. The XML callbacks in this file store
  // channel data under lower-case keys only, so upper-case lookups such as
  // 'LINK' would never find anything.
  $feed->link = !empty($channel['link']) ? $channel['link'] : '';
  $feed->description = !empty($channel['description']) ? $channel['description'] : '';
  $feed->image = $image;
  $feed->etag = $etag;
  $feed->modified = $modified;

  // Clear the cache.
  cache_clear_all();

  return TRUE;
}
/**
 * Parses a feed and stores its items on the feed object.
 *
 * @param $data
 *   The feed data. Taken by reference.
 * @param $feed
 *   An object describing the feed to be parsed. $feed->title is used in error
 *   messages, $feed->link is the fallback link for items that have none, and
 *   $feed->items is replaced with the parsed items.
 *
 * @return
 *   FALSE on error, TRUE otherwise.
 */
function aggregator_parse_feed(&$data, $feed) {
  global $items, $image, $channel;

  // Unset the global variables before we use them.
  unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']);
  $items = array();
  $image = array();
  $channel = array();

  // Parse the data.
  $xml_parser = drupal_xml_parser_create($data);
  xml_set_element_handler($xml_parser, 'aggregator_element_start', 'aggregator_element_end');
  xml_set_character_data_handler($xml_parser, 'aggregator_element_data');

  if (!xml_parse($xml_parser, $data, 1)) {
    // Read the error details first, then release the parser so that it is
    // freed on the failure path as well.
    $error_args = array(
      '%site' => $feed->title,
      '%error' => xml_error_string(xml_get_error_code($xml_parser)),
      '%line' => xml_get_current_line_number($xml_parser),
    );
    xml_parser_free($xml_parser);
    watchdog('aggregator', 'The feed from %site seems to be broken, due to an error "%error" on line %line.', $error_args, WATCHDOG_WARNING);
    drupal_set_message(t('The feed from %site seems to be broken, because of error "%error" on line %line.', $error_args), 'error');
    return FALSE;
  }
  xml_parser_free($xml_parser);

  // We reverse the array such that we store the first item last, and the last
  // item first. In the database, the newest item should be at the top.
  $items = array_reverse($items);

  // Item keys that may carry a publication date, in order of preference.
  $date_keys = array('pubdate', 'dc:date', 'dcterms:issued', 'dcterms:created', 'dcterms:modified', 'issued', 'created', 'modified', 'published', 'updated');

  // Initialize items array.
  $feed->items = array();
  foreach ($items as $item) {
    // Prepare the item:
    foreach ($item as $key => $value) {
      $item[$key] = trim($value);
    }

    // Resolve the item's title. If no title is found, we use up to 40
    // characters of the description ending at a word boundary, but not
    // splitting potential entities.
    if (empty($item['title'])) {
      if (!empty($item['description'])) {
        $item['title'] = preg_replace('/^(.*)[^\w;&].*?$/', "\\1", truncate_utf8($item['description'], 40));
      }
      else {
        $item['title'] = '';
      }
    }

    // Resolve the item's link; fall back to the feed's link.
    if (empty($item['link'])) {
      $item['link'] = $feed->link;
    }

    $item['guid'] = isset($item['guid']) ? $item['guid'] : '';

    // Atom feeds have a content and/or summary tag instead of a description tag.
    if (!empty($item['content:encoded'])) {
      $item['description'] = $item['content:encoded'];
    }
    elseif (!empty($item['summary'])) {
      $item['description'] = $item['summary'];
    }
    elseif (!empty($item['content'])) {
      $item['description'] = $item['content'];
    }

    // Try to resolve and parse the item's publication date.
    $date = '';
    foreach ($date_keys as $key) {
      if (!empty($item[$key])) {
        $date = $item[$key];
        break;
      }
    }
    $item['timestamp'] = strtotime($date);
    if ($item['timestamp'] === FALSE) {
      // aggregator_parse_w3cdtf() returns FALSE on failure.
      $item['timestamp'] = aggregator_parse_w3cdtf($date);
    }

    // Resolve dc:creator tag as the item author if author tag is not set.
    if (empty($item['author']) && !empty($item['dc:creator'])) {
      $item['author'] = $item['dc:creator'];
    }

    // Make sure the keys below always exist.
    $item += array('author' => '', 'description' => '');

    // Store on $feed object. This is where processors will look for parsed items.
    $feed->items[] = $item;
  }

  return TRUE;
}
/**
 * Callback function used by the XML parser for opening tags.
 *
 * Tracks the current tag and enclosing element in the $tag and $element
 * globals, counts items in $item, and records rel="alternate" links.
 *
 * @param $parser
 *   The XML parser calling the handler (unused).
 * @param $name
 *   The element name; it is compared case-insensitively.
 * @param $attributes
 *   An array of the element's attributes; keys are compared
 *   case-insensitively.
 */
function aggregator_element_start($parser, $name, $attributes) {
  global $item, $element, $tag, $items, $channel;

  // Normalize both the element name and the attribute names: a case-folding
  // XML parser reports them in upper case, while this file works with
  // lower-case keys throughout.
  $name = strtolower($name);
  $attributes = array_change_key_case($attributes, CASE_LOWER);

  switch ($name) {
    case 'image':
    case 'textinput':
    case 'content':
    case 'summary':
    case 'tagline':
    case 'subtitle':
    case 'logo':
    case 'info':
      $element = $name;
      break;

    case 'id':
      if ($element != 'item') {
        $element = $name;
      }
      // NOTE(review): the original has no break here, so 'id' also runs the
      // 'link' handling below. Kept as is; confirm this is intended.

    case 'link':
      // Only links marked as the alternate representation are recorded, and
      // only when they actually carry a target.
      if (!empty($attributes['rel']) && $attributes['rel'] == 'alternate' && isset($attributes['href'])) {
        if ($element == 'item') {
          $items[$item]['link'] = $attributes['href'];
        }
        else {
          $channel['link'] = $attributes['href'];
        }
      }
      break;

    case 'item':
      $element = $name;
      $item += 1;
      break;

    case 'entry':
      // Atom entries are handled like RSS items.
      $element = 'item';
      $item += 1;
      break;
  }

  $tag = $name;
}
/**
 * Callback function used by the XML parser for closing tags.
 *
 * Clears the $element global when one of the tracked elements is closed.
 *
 * @param $parser
 *   The XML parser calling the handler (unused).
 * @param $name
 *   The element name; it is compared case-insensitively.
 */
function aggregator_element_end($parser, $name) {
  global $element;

  // Normalize the name the same way aggregator_element_start() does;
  // otherwise upper-case names from a case-folding parser never match the
  // lower-case cases below and $element is never reset.
  $name = strtolower($name);

  switch ($name) {
    case 'image':
    case 'textinput':
    case 'item':
    case 'entry':
    case 'content':
    case 'info':
      $element = '';
      break;

    case 'id':
      // Only reset when the 'id' tag itself was the tracked element.
      if ($element == 'id') {
        $element = '';
      }
      break;
  }
}
/**
 * Callback function used by the XML parser for character data.
 *
 * Appends the data to the item, image or channel global, depending on the
 * element currently tracked in the $element global.
 *
 * @param $parser
 *   The XML parser calling the handler (unused).
 * @param $data
 *   The character data; a single text node may arrive in several calls, so
 *   the data is always appended.
 */
function aggregator_element_data($parser, $data) {
  global $channel, $element, $items, $item, $image, $tag;

  // Make sure there is an entry for the current item before appending to it.
  $items += array($item => array());

  switch ($element) {
    case 'item':
      $items[$item] += array($tag => '');
      $items[$item][$tag] .= $data;
      break;

    case 'image':
    case 'logo':
      $image += array($tag => '');
      $image[$tag] .= $data;
      break;

    case 'link':
      if ($data) {
        $items[$item] += array($tag => '');
        $items[$item][$tag] .= $data;
      }
      break;

    case 'content':
    case 'summary':
      // Stored under the element's own name, whatever the current tag is.
      $items[$item] += array($element => '');
      $items[$item][$element] .= $data;
      break;

    case 'tagline':
    case 'subtitle':
      $channel += array('description' => '');
      $channel['description'] .= $data;
      break;

    case 'info':
    case 'id':
    case 'textinput':
      // The sub-element is not supported. However, we must recognize
      // it or its contents will end up in the item array.
      break;

    default:
      $channel += array($tag => '');
      $channel[$tag] .= $data;
  }
}
/**
 * Parses the W3C date/time format, a subset of ISO 8601.
 *
 * See http://www.w3.org/TR/NOTE-datetime for more information.
 * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
 *
 * @param $date_str
 *   A string with a potentially W3C DTF date. Seconds and the time zone
 *   designator are optional; a missing time zone is treated as GMT.
 *
 * @return
 *   A timestamp if parsed successfully or FALSE if not.
 */
function aggregator_parse_w3cdtf($date_str) {
  // Capture groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 ":SS",
  // 7 seconds, 8 offset sign, 9 offset hours, 10 offset minutes, 11 "Z".
  $pattern = '/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/';
  if (!preg_match($pattern, (string) $date_str, $match)) {
    return FALSE;
  }

  // preg_match() leaves out trailing groups that did not take part in the
  // match; pad the result so every index can be read safely.
  $match += array_fill(0, 12, '');

  // Calculate the epoch for the given date assuming GMT. The seconds are in
  // group 7; group 6 still includes the leading colon.
  $epoch = gmmktime((int) $match[4], (int) $match[5], (int) $match[7], (int) $match[2], (int) $match[3], (int) $match[1]);

  // "Z" is zulu time, aka GMT, and needs no correction. Otherwise apply the
  // numeric offset, if one was given.
  if ($match[11] !== 'Z' && $match[8] !== '') {
    $offset_secs = (((int) $match[9] * 60) + (int) $match[10]) * 60;
    // Is timezone ahead of GMT? If yes, subtract offset.
    if ($match[8] == '+') {
      $offset_secs *= -1;
    }
    $epoch += $offset_secs;
  }

  return $epoch;
}