westdc-zf1/include/LibRIS/RISReader.php

193 lines
4.7 KiB
PHP
Raw Normal View History

2013-10-28 14:54:21 +00:00
<?php
/**
* This is a library for parsing RIS files.
*
* LibRIS::RISReader() is the main parser.
* LibRIS::RISWriter() can generate RIS data.
* LibRIS::RISTags() contains useful RIS information.
*
* @see http://www.refman.com/support/risformat_intro.asp
*/
namespace LibRIS;
/**
* The main class for parsing RIS files.
*
* Usage:
* @code
* <?php
*
* use \LibRIS\RISReader;
*
* $reader = new RISReader();
*
* // Parse a file of RIS data.
* $reader->parseFile('path/to/file.ris');
*
* // Parse a string containing RIS data.
* $reader->parseString($someRisString);
*
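* // Parse an array of lines. Note: parseArray() is declared protected, so
* // this call only works from inside RISReader or one of its subclasses.
* $reader->parseArray($arrayOfRISDirectives);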
*
* // Get an associative array of records.
* $array = $reader->getRecords();
*
* // Dump the records to STDOUT
* $reader->printRecords();
*
* ?>
* @endcode
*
* The data structure generated by this class is of the form
* @code
* <?php
* array(
* [0] => array(
* 'T1' => array('title one', 'title 2'),
* 'TY' => array('JOUR'),
* // Other tags and their values.
* ),
* [1] => array(
* 'T1' => array('another entry'),
* 'TY' => array('JOUR'),
* ),
* );
* ?>
* @endcode
*/
class RISReader {

  /**
   * Canonical RIS line terminator (CRLF).
   *
   * Kept for callers that reference it; parseString() itself also accepts
   * bare LF and bare CR line endings.
   */
  const RIS_EOL = "\r\n";

  /**
   * Pattern applied to every trimmed input line.
   *
   * Capture groups:
   * - 2: the two-character tag (for example "TY", "T1" or "ER").
   * - 3: everything that follows the dash after the tag.
   * - 4: the complete line when it does not start with a tag; such a line is
   *   treated as a continuation of the previous tag's value.
   */
  const LINE_REGEX = '/^(([A-Z1-9]{2})\s+-(.*))|(.*)$/';

  /**
   * Records produced by the most recent parse, or NULL before any parse.
   */
  protected $data = NULL;

  /**
   * Create a reader.
   *
   * @param array $options
   *   Accepted for interface compatibility; no option is read at present.
   */
  public function __construct($options = array()) {
  }

  /**
   * Parse an RIS file.
   *
   * The parsed records replace any previously parsed data and are also
   * available afterwards through getRecords().
   *
   * @param string $filename
   *   The full path to the file to parse.
   * @param resource $context
   *   The stream context (if desired) for handling the file.
   * @retval array
   *   An indexed array of individual sources, each of which is an
   *   associative array of entry details. (See LibRIS)
   * @throws ParseException
   *   When the file does not exist or cannot be read.
   */
  public function parseFile($filename, $context = NULL) {
    if (!is_file($filename)) {
      throw new ParseException(sprintf('File %s not found.', htmlentities($filename)));
    }

    // FILE_TEXT is deliberately not passed: it never had any effect and is
    // deprecated as of PHP 8.1. FILE_IGNORE_NEW_LINES is what allows
    // FILE_SKIP_EMPTY_LINES to actually drop blank lines.
    $flags = FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES;
    $contents = file($filename, $flags, $context);
    if ($contents === FALSE) {
      throw new ParseException(sprintf('File %s could not be read.', htmlentities($filename)));
    }

    $this->parseArray($contents);
    return $this->data;
  }

  /**
   * Parse a string of RIS data.
   *
   * Lines may be separated by CRLF, LF or CR. The parsed records replace any
   * previously parsed data and are also available through getRecords().
   *
   * @param string $string
   *   RIS-formatted data in a string.
   * @retval array
   *   An indexed array of individual sources, each of which is an
   *   associative array of entry details. (See {@link LibRIS})
   */
  public function parseString($string) {
    // Splitting on CRLF alone would turn LF-only input into a single line
    // and lose every tag after the first one.
    $contents = preg_split('/\r\n|\r|\n/', (string) $string);
    $this->parseArray($contents);
    return $this->data;
  }

  /**
   * Take an array of lines and parse them into a set of RIS records.
   *
   * The result is stored in $this->data.
   *
   * @param array $lines
   *   Indexed array of lines of RIS data.
   */
  protected function parseArray($lines) {
    $recordset = array();

    // Do any cleaning and normalizing.
    $this->cleanData($lines);

    $record = array();
    $lastTag = NULL;
    foreach ($lines as $line) {
      $line = trim($line);
      $matches = array();
      if (!preg_match(self::LINE_REGEX, $line, $matches)) {
        continue;
      }

      // Compare against '' explicitly: empty() would also discard a value
      // or continuation line that is exactly "0".
      $tag = isset($matches[2]) ? $matches[2] : '';
      $value = isset($matches[3]) ? $matches[3] : '';
      $text = isset($matches[4]) ? $matches[4] : '';

      if ($tag !== '' && $value !== '') {
        // A tagged line with a value starts a new entry for that tag.
        $lastTag = $tag;
        $record[$tag][] = trim($value);
      }
      elseif ($tag === 'ER') {
        // End record and prep a new one.
        $lastTag = NULL;
        $recordset[] = $record;
        $record = array();
      }
      elseif ($text !== '' && $lastTag !== NULL) {
        // Untagged text continues the most recent entry. Text seen before
        // any tag (leading junk) is skipped.
        $lastEntry = count($record[$lastTag]) - 1;
        // We trim because some encoders add tabs or multiple spaces.
        // Standard is silent on how this should be handled.
        $record[$lastTag][$lastEntry] .= ' ' . trim($text);
      }
    }

    // Keep a trailing record that was not closed by an ER line.
    if (!empty($record)) {
      $recordset[] = $record;
    }

    $this->data = $recordset;
  }

  /**
   * Get the records produced by the most recent parse.
   *
   * @retval array
   *   Indexed array of records, each mapping a tag to an array of values;
   *   NULL if nothing has been parsed yet.
   */
  public function getRecords() {
    return $this->data;
  }

  /**
   * Print every parsed record to standard output.
   *
   * Each value is printed under the description that RISTags::describeTag()
   * returns for its tag. Nothing is printed when no data has been parsed.
   */
  public function printRecords() {
    // $this->data is NULL until a parse has run; iterating it would warn.
    if (empty($this->data)) {
      return;
    }

    $format = "%s:\n\t%s\n";
    foreach ($this->data as $record) {
      foreach ($record as $key => $values) {
        foreach ($values as $value) {
          printf($format, RISTags::describeTag($key), $value);
        }
      }
      print PHP_EOL;
    }
  }

  /**
   * Clean up the data before processing.
   *
   * @param array $lines
   *   Indexed array of lines of data. Modified in place.
   */
  protected function cleanData(&$lines) {
    if (empty($lines) || !is_array($lines)) {
      return;
    }

    // Currently, we only need to strip a BOM if it exists.
    // Thanks to Derik Badman (http://madinkbeard.com/) for finding the
    // bug and suggesting this fix:
    // http://blog.philipp-michels.de/?p=32
    //
    // Look up the first key instead of assuming index 0 exists.
    reset($lines);
    $firstKey = key($lines);
    $bom = "\xEF\xBB\xBF";
    if (strncmp($lines[$firstKey], $bom, 3) === 0) {
      // The cast keeps the element a string on PHP versions where substr()
      // returns FALSE for an offset equal to the string length.
      $lines[$firstKey] = (string) substr($lines[$firstKey], 3);
    }
  }

}