193 lines
4.7 KiB
PHP
193 lines
4.7 KiB
PHP
|
<?php
|
||
|
/**
|
||
|
* This is a library for parsing RIS files.
|
||
|
*
|
||
|
* LibRIS::RISReader() is the main parser.
|
||
|
* LibRIS::RISWriter() can generate RIS data.
|
||
|
* LibRIS::RISTags() contains useful RIS information.
|
||
|
*
|
||
|
* @see http://www.refman.com/support/risformat_intro.asp
|
||
|
*/
|
||
|
|
||
|
namespace LibRIS;
|
||
|
|
||
|
/**
|
||
|
* The main class for parsing RIS files.
|
||
|
*
|
||
|
* Usage:
|
||
|
* @code
|
||
|
* <?php
|
||
|
*
|
||
|
* use \LibRIS\RISReader;
|
||
|
*
|
||
|
* $reader = new RISReader();
|
||
|
*
|
||
|
* // Parse a file of RIS data.
|
||
|
* $reader->parseFile('path/to/file.ris');
|
||
|
*
|
||
|
* // Parse a string containing RIS data.
|
||
|
* $reader->parseString($someRisString);
|
||
|
*
|
||
|
* // Parse an array of lines (parseArray() is protected: subclasses only).
|
||
|
* $reader->parseArray($arrayOfRISDirectives);
|
||
|
*
|
||
|
* // Get an associative array of records.
|
||
|
* $array = $reader->getRecords();
|
||
|
*
|
||
|
* // Dump the records to STDOUT
|
||
|
* $reader->printRecords();
|
||
|
*
|
||
|
* ?>
|
||
|
* @endcode
|
||
|
*
|
||
|
* The data structure generated by this class is of the form
|
||
|
* @code
|
||
|
* <?php
|
||
|
* array(
|
||
|
* [0] => array(
|
||
|
* 'T1' => array('title one', 'title 2'),
|
||
|
* 'TY' => array('JOUR'),
|
||
|
* // Other tags and their values.
|
||
|
* ),
|
||
|
* [1] => array(
|
||
|
* 'T1' => array('another entry'),
|
||
|
* 'TY' => array('JOUR'),
|
||
|
* ),
|
||
|
* );
|
||
|
* ?>
|
||
|
* @endcode
|
||
|
*/
|
||
|
class RISReader {

  /**
   * End-of-line sequence constant.
   *
   * parseString() does not rely on it for splitting: it accepts CRLF, LF and
   * CR line endings alike.
   */
  const RIS_EOL = "\r\n";

  /**
   * Pattern applied to every (trimmed) line.
   *
   * Capture group 2 is the two-character tag, group 3 is the text following
   * the dash, and group 4 is the whole line when it is not a tag line (i.e. a
   * continuation of the previous value).
   */
  const LINE_REGEX = '/^(([A-Z1-9]{2})\s+-(.*))|(.*)$/';

  /**
   * Parsed records; NULL until one of the parse methods has been called.
   */
  protected $data = NULL;

  /**
   * Create a new reader.
   *
   * @param array $options
   *   Accepted but not used by this class.
   */
  public function __construct($options = array()) {

  }

  /**
   * Parse an RIS file.
   *
   * The parsed records are stored on the reader (see getRecords()) and also
   * returned.
   *
   * @param string $filename
   *   The full path to the file to parse.
   * @param resource $context
   *   The stream context (if desired) for handling the file.
   *
   * @return array
   *   An indexed array of individual sources, each of which is an
   *   associative array mapping a tag to an indexed array of its values.
   *
   * @throws ParseException
   *   If the file does not exist or cannot be read.
   */
  public function parseFile($filename, $context = NULL) {
    if (!is_file($filename)) {
      throw new ParseException(sprintf('File %s not found.', htmlentities($filename)));
    }

    // FILE_SKIP_EMPTY_LINES only has an effect together with
    // FILE_IGNORE_NEW_LINES. FILE_TEXT was dropped: it never did anything and
    // is deprecated as of PHP 8.1.
    $flags = FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES;
    $contents = file($filename, $flags, $context);

    // file() signals a read failure with FALSE; do not parse that as data.
    if ($contents === FALSE) {
      throw new ParseException(sprintf('File %s could not be read.', htmlentities($filename)));
    }

    $this->parseArray($contents);
    return $this->data;
  }

  /**
   * Parse a string of RIS data.
   *
   * The string is split into lines on CRLF, LF or CR. The parsed records are
   * stored on the reader (see getRecords()) and also returned.
   *
   * @param string $string
   *   RIS-formatted data in a string.
   *
   * @return array
   *   An indexed array of individual sources, each of which is an
   *   associative array mapping a tag to an indexed array of its values.
   */
  public function parseString($string) {
    // An explicit alternation is used instead of \R: without the /u modifier
    // \R also matches the single byte 0x85, which occurs inside multi-byte
    // UTF-8 characters.
    $contents = preg_split('/\r\n|\r|\n/', (string) $string);
    $this->parseArray($contents);
    return $this->data;
  }

  /**
   * Take an array of lines and parse them into a set of RIS records.
   *
   * The result replaces whatever was previously stored on the reader.
   *
   * @param array $lines
   *   Indexed array of lines of data.
   */
  protected function parseArray($lines) {
    $recordset = array();

    // Do any cleaning and normalizing.
    $this->cleanData($lines);

    $record = array();
    $lastTag = NULL;
    foreach ($lines as $line) {
      $line = trim($line);
      $matches = array();
      preg_match(self::LINE_REGEX, $line, $matches);

      // Strict comparisons against '' (rather than empty()) keep a literal
      // "0" value or continuation line from being discarded.
      $tag = isset($matches[2]) ? $matches[2] : '';
      $value = isset($matches[3]) ? $matches[3] : '';
      $continuation = isset($matches[4]) ? $matches[4] : '';

      if ($value !== '') {
        // A tag line with a value: start a new entry under that tag.
        $lastTag = $tag;
        $record[$tag][] = trim($value);
      }
      // End record and prep a new one.
      elseif ($tag === 'ER') {
        $lastTag = NULL;
        $recordset[] = $record;
        $record = array();
      }
      elseif ($continuation !== '') {
        // Append to the last entry of the most recent tag.
        // Text seen before any tag (leading junk) is skipped.
        if (!empty($lastTag)) {
          $lastEntry = count($record[$lastTag]) - 1;
          // We trim because some encoders add tabs or multiple spaces.
          // Standard is silent on how this should be handled.
          $record[$lastTag][$lastEntry] .= ' ' . trim($continuation);
        }
      }
      // NOTE(review): a non-ER tag with an empty value matches none of the
      // branches above, so $lastTag is not updated and any continuation lines
      // that follow are appended to the previous tag.
    }

    // Keep a trailing record that was not closed by an ER line.
    if (!empty($record)) {
      $recordset[] = $record;
    }

    $this->data = $recordset;
  }

  /**
   * Get the records produced by the last parse call.
   *
   * @return array|null
   *   Indexed array of records, or NULL if nothing has been parsed yet.
   */
  public function getRecords() {
    return $this->data;
  }

  /**
   * Print every value of every record, labelled via RISTags::describeTag().
   *
   * Prints nothing when no records are available.
   */
  public function printRecords() {
    // Nothing parsed yet (NULL) or no records: avoid iterating a non-array.
    if (empty($this->data)) {
      return;
    }

    $format = "%s:\n\t%s\n";
    foreach ($this->data as $record) {
      foreach ($record as $key => $values) {
        foreach ($values as $value) {
          printf($format, RISTags::describeTag($key), $value);
        }
      }

      print PHP_EOL;
    }
  }

  /**
   * Clean up the data before processing.
   *
   * Currently this only strips a UTF-8 byte order mark from the first line.
   *
   * @param array $lines
   *   Indexed array of lines of data; modified in place.
   */
  protected function cleanData(&$lines) {
    if (empty($lines) || !isset($lines[0]) || !is_string($lines[0])) {
      return;
    }

    // Thanks to Derik Badman (http://madinkbeard.com/) for finding the
    // bug and suggesting this fix:
    // http://blog.philipp-michels.de/?p=32
    $first = $lines[0];
    if (substr($first, 0, 3) === "\xEF\xBB\xBF") {
      // The cast covers PHP versions where substr() returns FALSE when the
      // line consists of the BOM alone.
      $lines[0] = (string) substr($first, 3);
    }
  }

}
|