2008-12-22 19:38:31 +00:00
< ? php
/**
* @ file
* Parser functions for the aggregator module .
*/
/**
2009-12-04 16:49:48 +00:00
* Implements hook_aggregator_parse_info () .
2008-12-22 19:38:31 +00:00
*/
function aggregator_aggregator_parse_info () {
return array (
'title' => t ( 'Default parser' ),
'description' => t ( 'Parses RSS, Atom and RDF feeds.' ),
);
}
/**
2009-12-04 16:49:48 +00:00
* Implements hook_aggregator_parse () .
2008-12-22 19:38:31 +00:00
*/
function aggregator_aggregator_parse ( $feed ) {
global $channel , $image ;
// Filter the input data.
if ( aggregator_parse_feed ( $feed -> source_string , $feed )) {
2010-05-20 08:51:24 +00:00
$modified = empty ( $feed -> http_headers [ 'last-modified' ]) ? 0 : strtotime ( $feed -> http_headers [ 'last-modified' ]);
2008-12-22 19:38:31 +00:00
// Prepare the channel data.
foreach ( $channel as $key => $value ) {
$channel [ $key ] = trim ( $value );
}
// Prepare the image data (if any).
foreach ( $image as $key => $value ) {
$image [ $key ] = trim ( $value );
}
2010-05-20 08:51:24 +00:00
$etag = empty ( $feed -> http_headers [ 'etag' ]) ? '' : $feed -> http_headers [ 'etag' ];
2009-10-12 15:54:59 +00:00
// Add parsed data to the feed object.
2011-10-10 00:52:35 +00:00
$feed -> link = ! empty ( $channel [ 'link' ]) ? $channel [ 'link' ] : '' ;
$feed -> description = ! empty ( $channel [ 'description' ]) ? $channel [ 'description' ] : '' ;
$feed -> image = ! empty ( $image [ 'url' ]) ? $image [ 'url' ] : '' ;
2009-10-12 15:54:59 +00:00
$feed -> etag = $etag ;
$feed -> modified = $modified ;
2008-12-22 19:38:31 +00:00
2012-06-13 01:37:07 +00:00
// Clear the page and block caches.
cache_invalidate ( array ( 'content' => TRUE ));
2008-12-22 19:38:31 +00:00
2009-10-12 15:54:59 +00:00
return TRUE ;
2008-12-22 19:38:31 +00:00
}
2009-10-12 15:54:59 +00:00
return FALSE ;
2008-12-22 19:38:31 +00:00
}
/**
2011-11-10 03:02:40 +00:00
* Parses a feed and stores its items .
2008-12-22 19:38:31 +00:00
*
* @ param $data
* The feed data .
* @ param $feed
* An object describing the feed to be parsed .
2011-11-10 03:02:40 +00:00
*
2008-12-22 19:38:31 +00:00
* @ return
* FALSE on error , TRUE otherwise .
*/
function aggregator_parse_feed ( & $data , $feed ) {
global $items , $image , $channel ;
// Unset the global variables before we use them.
unset ( $GLOBALS [ 'element' ], $GLOBALS [ 'item' ], $GLOBALS [ 'tag' ]);
$items = array ();
$image = array ();
$channel = array ();
// Parse the data.
$xml_parser = drupal_xml_parser_create ( $data );
xml_set_element_handler ( $xml_parser , 'aggregator_element_start' , 'aggregator_element_end' );
xml_set_character_data_handler ( $xml_parser , 'aggregator_element_data' );
if ( ! xml_parse ( $xml_parser , $data , 1 )) {
2011-10-01 19:47:01 +00:00
watchdog ( 'aggregator' , 'The feed from %site seems to be broken due to an error "%error" on line %line.' , array ( '%site' => $feed -> title , '%error' => xml_error_string ( xml_get_error_code ( $xml_parser )), '%line' => xml_get_current_line_number ( $xml_parser )), WATCHDOG_WARNING );
drupal_set_message ( t ( 'The feed from %site seems to be broken because of error "%error" on line %line.' , array ( '%site' => $feed -> title , '%error' => xml_error_string ( xml_get_error_code ( $xml_parser )), '%line' => xml_get_current_line_number ( $xml_parser ))), 'error' );
2008-12-22 19:38:31 +00:00
return FALSE ;
}
xml_parser_free ( $xml_parser );
// We reverse the array such that we store the first item last, and the last
// item first. In the database, the newest item should be at the top.
$items = array_reverse ( $items );
// Initialize items array.
$feed -> items = array ();
foreach ( $items as $item ) {
// Prepare the item:
foreach ( $item as $key => $value ) {
$item [ $key ] = trim ( $value );
}
// Resolve the item's title. If no title is found, we use up to 40
// characters of the description ending at a word boundary, but not
// splitting potential entities.
2009-05-18 09:41:40 +00:00
if ( ! empty ( $item [ 'title' ])) {
$item [ 'title' ] = $item [ 'title' ];
2008-12-22 19:38:31 +00:00
}
2009-05-18 09:41:40 +00:00
elseif ( ! empty ( $item [ 'description' ])) {
$item [ 'title' ] = preg_replace ( '/^(.*)[^\w;&].*?$/' , " \\ 1 " , truncate_utf8 ( $item [ 'description' ], 40 ));
2008-12-22 19:38:31 +00:00
}
else {
2009-05-18 09:41:40 +00:00
$item [ 'title' ] = '' ;
2008-12-22 19:38:31 +00:00
}
// Resolve the items link.
2009-05-18 09:41:40 +00:00
if ( ! empty ( $item [ 'link' ])) {
$item [ 'link' ] = $item [ 'link' ];
2008-12-22 19:38:31 +00:00
}
else {
2009-05-18 09:41:40 +00:00
$item [ 'link' ] = $feed -> link ;
2008-12-22 19:38:31 +00:00
}
2011-01-12 23:20:33 +00:00
// Atom feeds have an ID tag instead of a GUID tag.
if ( ! isset ( $item [ 'guid' ])) {
$item [ 'guid' ] = isset ( $item [ 'id' ]) ? $item [ 'id' ] : '' ;
}
2008-12-22 19:38:31 +00:00
2009-05-18 09:41:40 +00:00
// Atom feeds have a content and/or summary tag instead of a description tag.
if ( ! empty ( $item [ 'content:encoded' ])) {
$item [ 'description' ] = $item [ 'content:encoded' ];
2008-12-22 19:38:31 +00:00
}
2009-05-18 09:41:40 +00:00
elseif ( ! empty ( $item [ 'summary' ])) {
$item [ 'description' ] = $item [ 'summary' ];
2008-12-22 19:38:31 +00:00
}
2009-05-18 09:41:40 +00:00
elseif ( ! empty ( $item [ 'content' ])) {
$item [ 'description' ] = $item [ 'content' ];
2008-12-22 19:38:31 +00:00
}
// Try to resolve and parse the item's publication date.
$date = '' ;
2009-05-18 09:41:40 +00:00
foreach ( array ( 'pubdate' , 'dc:date' , 'dcterms:issued' , 'dcterms:created' , 'dcterms:modified' , 'issued' , 'created' , 'modified' , 'published' , 'updated' ) as $key ) {
2008-12-22 19:38:31 +00:00
if ( ! empty ( $item [ $key ])) {
$date = $item [ $key ];
break ;
}
}
2009-05-18 09:41:40 +00:00
$item [ 'timestamp' ] = strtotime ( $date );
2008-12-22 19:38:31 +00:00
2009-05-18 09:41:40 +00:00
if ( $item [ 'timestamp' ] === FALSE ) {
$item [ 'timestamp' ] = aggregator_parse_w3cdtf ( $date ); // Aggregator_parse_w3cdtf() returns FALSE on failure.
2008-12-22 19:38:31 +00:00
}
2009-07-15 21:32:43 +00:00
// Resolve dc:creator tag as the item author if author tag is not set.
if ( empty ( $item [ 'author' ]) && ! empty ( $item [ 'dc:creator' ])) {
$item [ 'author' ] = $item [ 'dc:creator' ];
}
2010-01-30 07:59:26 +00:00
2009-05-18 09:41:40 +00:00
$item += array ( 'author' => '' , 'description' => '' );
2008-12-22 19:38:31 +00:00
// Store on $feed object. This is where processors will look for parsed items.
$feed -> items [] = $item ;
}
return TRUE ;
}
/**
2011-11-10 03:02:40 +00:00
* Performs an action when an opening tag is encountered .
*
* Callback function used by xml_parse () within aggregator_parse_feed () .
2008-12-22 19:38:31 +00:00
*/
function aggregator_element_start ( $parser , $name , $attributes ) {
global $item , $element , $tag , $items , $channel ;
2009-05-18 09:41:40 +00:00
$name = strtolower ( $name );
2008-12-22 19:38:31 +00:00
switch ( $name ) {
2009-05-18 09:41:40 +00:00
case 'image' :
case 'textinput' :
case 'summary' :
case 'tagline' :
case 'subtitle' :
case 'logo' :
case 'info' :
2008-12-22 19:38:31 +00:00
$element = $name ;
break ;
2009-05-18 09:41:40 +00:00
case 'id' :
2010-07-24 17:42:22 +00:00
case 'content' :
2009-05-18 09:41:40 +00:00
if ( $element != 'item' ) {
2008-12-22 19:38:31 +00:00
$element = $name ;
}
2009-05-18 09:41:40 +00:00
case 'link' :
2010-07-24 17:42:22 +00:00
// According to RFC 4287, link elements in Atom feeds without a 'rel'
// attribute should be interpreted as though the relation type is
// "alternate".
if ( ! empty ( $attributes [ 'HREF' ]) && ( empty ( $attributes [ 'REL' ]) || $attributes [ 'REL' ] == 'alternate' )) {
2009-05-18 09:41:40 +00:00
if ( $element == 'item' ) {
2010-07-24 17:42:22 +00:00
$items [ $item ][ 'link' ] = $attributes [ 'HREF' ];
2008-12-22 19:38:31 +00:00
}
else {
2010-07-24 17:42:22 +00:00
$channel [ 'link' ] = $attributes [ 'HREF' ];
2008-12-22 19:38:31 +00:00
}
}
break ;
2009-05-18 09:41:40 +00:00
case 'item' :
2008-12-22 19:38:31 +00:00
$element = $name ;
$item += 1 ;
break ;
2009-05-18 09:41:40 +00:00
case 'entry' :
$element = 'item' ;
2008-12-22 19:38:31 +00:00
$item += 1 ;
break ;
}
$tag = $name ;
}
/**
2011-11-10 03:02:40 +00:00
* Performs an action when a closing tag is encountered .
*
* Callback function used by xml_parse () within aggregator_parse_feed () .
2008-12-22 19:38:31 +00:00
*/
function aggregator_element_end ( $parser , $name ) {
global $element ;
switch ( $name ) {
2009-05-18 09:41:40 +00:00
case 'image' :
case 'textinput' :
case 'item' :
case 'entry' :
case 'info' :
2008-12-22 19:38:31 +00:00
$element = '' ;
break ;
2009-05-18 09:41:40 +00:00
case 'id' :
2010-07-24 17:42:22 +00:00
case 'content' :
if ( $element == $name ) {
2008-12-22 19:38:31 +00:00
$element = '' ;
}
}
}
/**
2011-11-10 03:02:40 +00:00
* Performs an action when data is encountered .
*
* Callback function used by xml_parse () within aggregator_parse_feed () .
2008-12-22 19:38:31 +00:00
*/
function aggregator_element_data ( $parser , $data ) {
global $channel , $element , $items , $item , $image , $tag ;
$items += array ( $item => array ());
switch ( $element ) {
2009-05-18 09:41:40 +00:00
case 'item' :
2008-12-22 19:38:31 +00:00
$items [ $item ] += array ( $tag => '' );
$items [ $item ][ $tag ] .= $data ;
break ;
2009-05-18 09:41:40 +00:00
case 'image' :
case 'logo' :
2008-12-22 19:38:31 +00:00
$image += array ( $tag => '' );
$image [ $tag ] .= $data ;
break ;
2009-05-18 09:41:40 +00:00
case 'link' :
2008-12-22 19:38:31 +00:00
if ( $data ) {
$items [ $item ] += array ( $tag => '' );
$items [ $item ][ $tag ] .= $data ;
}
break ;
2009-05-18 09:41:40 +00:00
case 'content' :
$items [ $item ] += array ( 'content' => '' );
$items [ $item ][ 'content' ] .= $data ;
2008-12-22 19:38:31 +00:00
break ;
2009-05-18 09:41:40 +00:00
case 'summary' :
$items [ $item ] += array ( 'summary' => '' );
$items [ $item ][ 'summary' ] .= $data ;
2008-12-22 19:38:31 +00:00
break ;
2009-05-18 09:41:40 +00:00
case 'tagline' :
case 'subtitle' :
$channel += array ( 'description' => '' );
$channel [ 'description' ] .= $data ;
2008-12-22 19:38:31 +00:00
break ;
2009-05-18 09:41:40 +00:00
case 'info' :
case 'id' :
case 'textinput' :
2008-12-22 19:38:31 +00:00
// The sub-element is not supported. However, we must recognize
// it or its contents will end up in the item array.
break ;
default :
$channel += array ( $tag => '' );
$channel [ $tag ] .= $data ;
}
}
/**
2011-11-10 03:02:40 +00:00
* Parses the W3C date / time format , a subset of ISO 8601.
2008-12-22 19:38:31 +00:00
*
2011-11-10 03:02:40 +00:00
* PHP date parsing functions do not handle this format . See
* http :// www . w3 . org / TR / NOTE - datetime for more information . Originally from
* MagpieRSS ( http :// magpierss . sourceforge . net / ) .
2008-12-22 19:38:31 +00:00
*
* @ param $date_str
* A string with a potentially W3C DTF date .
2011-11-10 03:02:40 +00:00
*
2008-12-22 19:38:31 +00:00
* @ return
* A timestamp if parsed successfully or FALSE if not .
*/
function aggregator_parse_w3cdtf ( $date_str ) {
if ( preg_match ( '/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/' , $date_str , $match )) {
list ( $year , $month , $day , $hours , $minutes , $seconds ) = array ( $match [ 1 ], $match [ 2 ], $match [ 3 ], $match [ 4 ], $match [ 5 ], $match [ 6 ]);
// Calculate the epoch for current date assuming GMT.
$epoch = gmmktime ( $hours , $minutes , $seconds , $month , $day , $year );
if ( $match [ 10 ] != 'Z' ) { // Z is zulu time, aka GMT
list ( $tz_mod , $tz_hour , $tz_min ) = array ( $match [ 8 ], $match [ 9 ], $match [ 10 ]);
// Zero out the variables.
if ( ! $tz_hour ) {
$tz_hour = 0 ;
}
if ( ! $tz_min ) {
$tz_min = 0 ;
}
$offset_secs = (( $tz_hour * 60 ) + $tz_min ) * 60 ;
// Is timezone ahead of GMT? If yes, subtract offset.
if ( $tz_mod == '+' ) {
$offset_secs *= - 1 ;
}
$epoch += $offset_secs ;
}
return $epoch ;
}
else {
return FALSE ;
}
}