westdc-zf1/include/LibRIS/RISReader.php

<?php
/**
 * This is a library for parsing RIS files.
 *
 * LibRIS::RISReader() is the main parser.
 * LibRIS::RISWriter() can generate RIS data.
 * LibRIS::RISTags() contains useful RIS information.
 *
 * @see http://www.refman.com/support/risformat_intro.asp
 */

namespace LibRIS;

/**
 * The main class for parsing RIS files.
 *
 * Usage:
 * @code
 * <?php
 *
 * use \LibRIS\RISReader;
 *
 * $reader = new RISReader();
 *
 * // Parse a file of RIS data.
 * $reader->parseFile('path/to/file.ris');
 *
 * // Parse a string containing RIS data.
 * $reader->parseString($someRisString);
 *
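 * // Parse an array of lines. Note that parseArray() is protected, so it
 * // can only be called from within RISReader or a subclass:
 * // $this->parseArray($arrayOfRISDirectives);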
 *
 * // Get an associative array of records.
 * $array = $reader->getRecords();
 *
 * // Dump the records to STDOUT
 * $reader->printRecords();
 *
 * ?>
 * @endcode
 *
 * The data structure generated by this class is of the form
 * @code
 * <?php
 * array(
 *   [0] => array(
 *     'T1' => array('title one', 'title 2'),
 *     'TY' => array('JOUR'),
 *     // Other tags and their values.
 *   ),
 *   [1] => array(
 *     'T1' => array('another entry'),
 *     'TY' => array('JOUR'),
 *   ),
 * );
 * ?>
 * @endcode
 */
class RISReader {

  /**
   * The line terminator used by well-formed RIS data.
   *
   * Kept for backward compatibility. parseString() itself accepts CRLF, LF
   * and CR terminated input.
   */
  const RIS_EOL = "\r\n";

  /**
   * Pattern applied to every trimmed line.
   *
   * Group 2 captures a two-character tag and group 3 the text following the
   * dash. Group 4 captures any line that does not start with a tag; such a
   * line is treated as a continuation of the previous value.
   */
  const LINE_REGEX = '/^(([A-Z1-9]{2})\s+-(.*))|(.*)$/';

  /**
   * The records produced by the most recent parse, or NULL before any parse.
   */
  protected $data = NULL;

  /**
   * Create a reader.
   *
   * @param array $options
   *  Accepted for forward compatibility; currently not used.
   */
  public function __construct($options = array()) {

  }

  /**
   * Parse an RIS file.
   *
   * The parsed records are stored on the reader; fetch them with
   * getRecords(). This method itself does not return anything.
   *
   * @param string $filename
   *  The full path to the file to parse.
   * @param resource $context
   *  The stream context (if desired) for handling the file.
   * @throws ParseException
   *  If the path is not a regular file or the file cannot be read.
   */
  public function parseFile($filename, $context = NULL) {
    if (!is_file($filename)) {
      throw new ParseException(sprintf('File %s not found.', htmlentities($filename)));
    }

    // FILE_SKIP_EMPTY_LINES only takes effect together with
    // FILE_IGNORE_NEW_LINES. FILE_TEXT is intentionally not passed: it never
    // had any effect and is deprecated in recent PHP versions.
    $flags = FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES;
    $contents = file($filename, $flags, $context);

    // file() reports failure (e.g. a permissions problem) by returning FALSE.
    if ($contents === FALSE) {
      throw new ParseException(sprintf('File %s could not be read.', htmlentities($filename)));
    }

    $this->parseArray($contents);
  }

  /**
   * Parse a string of RIS data.
   *
   * The parsed records are stored on the reader; fetch them with
   * getRecords(). This method itself does not return anything.
   *
   * @param string $string
   *  RIS-formatted data in a string. Lines may be terminated by CRLF, LF
   *  or CR.
   */
  public function parseString($string) {
    // Splitting only on RIS_EOL would leave LF-only input as a single line,
    // and everything after the first tag would be lost.
    $contents = preg_split('/\r\n|\r|\n/', (string) $string);
    if ($contents === FALSE) {
      $contents = array();
    }
    $this->parseArray($contents);
  }

  /**
   * Take an array of lines and parse them into a set of RIS records.
   *
   * The result replaces whatever a previous parse stored on the reader.
   *
   * @param array $lines
   *  Indexed array of lines of RIS data.
   */
  protected function parseArray($lines) {
    $recordset = array();

    if (!is_array($lines)) {
      $lines = array();
    }

    // Do any cleaning and normalizing.
    $this->cleanData($lines);

    $record = array();
    $lastTag = NULL;
    foreach ($lines as $line) {
      $line = trim($line);
      $matches = array();

      preg_match(self::LINE_REGEX, $line, $matches);

      // Compare against '' rather than using empty(): a value of "0" is
      // legitimate data and must not be discarded.
      $tag = isset($matches[2]) ? $matches[2] : '';
      $value = isset($matches[3]) ? $matches[3] : '';
      $continuation = isset($matches[4]) ? $matches[4] : '';

      if ($tag !== '' && $value !== '') {
        // A tag with a value: start a new entry under that tag.
        $lastTag = $tag;
        $record[$tag][] = trim($value);
      }
      // End record and prep a new one.
      elseif ($tag === 'ER') {
        $lastTag = NULL;
        $recordset[] = $record;
        $record = array();
      }
      elseif ($continuation !== '' && $lastTag !== NULL) {
        // Append to the most recent entry. Untagged text that appears
        // before any tag has been seen is skipped.
        $lastEntry = count($record[$lastTag]) - 1;
        // We trim because some encoders add tabs or multiple spaces.
        // Standard is silent on how this should be handled.
        $record[$lastTag][$lastEntry] .= ' ' . trim($continuation);
      }
    }

    // Keep a trailing record that was not closed by an ER line.
    if (!empty($record)) {
      $recordset[] = $record;
    }

    $this->data = $recordset;
  }

  /**
   * Get the records produced by the most recent parse.
   *
   * @return array|null
   *  An indexed array of records, each an associative array mapping a tag
   *  to an indexed array of its values. NULL if nothing has been parsed yet.
   */
  public function getRecords() {
    return $this->data;
  }

  /**
   * Print every parsed record to the output, one tag description and value
   * per entry, with a blank line after each record.
   *
   * Prints nothing if no data has been parsed.
   */
  public function printRecords() {
    if (empty($this->data)) {
      return;
    }

    $format = "%s:\n\t%s\n";
    foreach ($this->data as $record) {
      foreach ($record as $key => $values) {
        foreach ($values as $value) {
          printf($format, RISTags::describeTag($key), $value);
        }
      }

      print PHP_EOL;
    }
  }

  /**
   * Clean up the data before processing.
   *
   * @param array $lines
   *   Indexed array of lines of data. Modified in place.
   */
  protected function cleanData(&$lines) {

    if (!is_array($lines) || !isset($lines[0])) {
      return;
    }

    // Currently, we only need to strip a BOM if it exists.
    // Thanks to Derik Badman (http://madinkbeard.com/) for finding the
    // bug and suggesting this fix:
    // http://blog.philipp-michels.de/?p=32
    $first = $lines[0];
    if (substr($first, 0, 3) == pack('CCC', 0xef, 0xbb, 0xbf)) {
      $lines[0] = substr($first, 3);
    }
  }

}
merge from heihedata branch to r4013. 2013-10-28 14:54:21 +00:00			`<?php`
			`/**`
			`* This is a library for parsing RIS files.`
			`*`
			`* LibRIS::RISReader() is the main parser.`
			`* LibRIS::RISWriter() can generate RIS data.`
			`* LibRIS::RISTags() contains useful RIS information.`
			`*`
			`* @see http://www.refman.com/support/risformat_intro.asp`
			`*/`

			`namespace LibRIS;`

			`/**`
			`* The main class for parsing RIS files.`
			`*`
			`* Usage:`
			`* @code`
			`* <?php`
			`*`
			`* use \LibRIS\RISReader;`
			`*`
			`* $reader = new RISReader();`
			`*`
			`* // Parse a file of RIS data.`
			`* $reader->parseFile('path/to/file.ris');`
			`*`
			`* // Parse a string containing RIS data.`
			`* $reader->parseString($someRisString);`
			`*`
			`* // Parse an array of lines.`
			`* $reader->parseArray($arrayOfRISDirectives);`
			`*`
			`* // Get an associative array of records.`
			`* $array = $reader->getRecords();`
			`*`
			`* // Dump the records to STDOUT`
			`* $reader->printRecords();`
			`*`
			`* ?>`
			`* @endcode`
			`*`
			`* The data structure generated by this class is of the form`
			`* @code`
			`* <?php`
			`* array(`
			`* [0] => array(`
			`* 'T1' => array('title one', 'title 2'),`
			`* 'TY' => array('JOUR'),`
			`* // Other tags and their values.`
			`* ),`
			`* [1] => array(`
			`* 'T1' => array('another entry'),`
			`* 'TY' => array('JOUR'),`
			`* ),`
			`* );`
			`* ?>`
			`* @endcode`
			`*/`
			`class RISReader {`

			`const RIS_EOL = "\r\n";`
			`const LINE_REGEX = '/^(([A-Z1-9]{2})\s+-(.))\|(.)$/';`

			`protected $data = NULL;`

			`public function __construct($options = array()) {`

			`}`

			`/**`
			`* Parse an RIS file.`
			`*`
			`* This will parse the file and return a data structure representing the`
			`* record.`
			`*`
			`* @param string $filename`
			`* The full path to the file to parse.`
			`* @param StreamContext $context`
			`* The stream context (in desired) for handling the file.`
			`* @retval array`
			`* An indexed array of individual sources, each of which is an`
			`* associative array of entry details. (See LibRIS)`
			`*/`
			`public function parseFile($filename, $context = NULL) {`
			`if (!is_file($filename)) {`
			`throw new ParseException(sprintf('File %s not found.', htmlentities($filename)));`
			`}`
			`$flags = FILE_SKIP_EMPTY_LINES \| FILE_TEXT;`
			`$contents = file($filename, $flags, $context);`

			`$this->parseArray($contents);`
			`}`

			`/**`
			`* Parse a string of RIS data.`
			`*`
			`* This will parse an RIS record into a representative data structure.`
			`*`
			`* @param string $string`
			`* RIS-formatted data in a string.`
			`* @param StreamContext $context`
			`* The stream context (in desired) for handling the file.`
			`* @retval array`
			`* An indexed array of individual sources, each of which is an`
			`* associative array of entry details. (See {@link LibRIS})`
			`*/`
			`public function parseString($string) {`
			`$contents = explode (RISReader::RIS_EOL, $string);`
			`$this->parseArray($contents);`
			`}`

			`/**`
			`* Take an array of lines and parse them into an RIS record.`
			`*/`
			`protected function parseArray($lines) {`
			`$recordset = array();`

			`// Do any cleaning and normalizing.`
			`$this->cleanData($lines);`

			`$record = array();`
			`$lastTag = NULL;`
			`foreach ($lines as $line) {`
			`$line = trim($line);`
			`$matches = array();`

			`preg_match(self::LINE_REGEX, $line, $matches);`
			`if (!empty($matches[3])) {`
			`$lastTag = $matches[2];`
			`$record[$matches[2]][] = trim($matches[3]);`
			`}`
			`// End record and prep a new one.`
			`elseif (!empty($matches[2]) && $matches[2] == 'ER') {`
			`$lastTag = NULL;`
			`$recordset[] = $record;`
			`$record = array();`
			`}`
			`elseif (!empty($matches[4])) {`
			`// Append to the last one.`
			`// We skip leading info (like BOMs).`
			`if (!empty($lastTag)) {`
			`$lastEntry = count($record[$lastTag]) - 1;`
			`// We trim because some encoders add tabs or multiple spaces.`
			`// Standard is silent on how this should be handled.`
			`$record[$lastTag][$lastEntry] .= ' ' . trim($matches[4]);`
			`}`
			`}`
			`}`
			`if (!empty($record)) $recordset[] = $record;`

			`$this->data = $recordset;`
			`}`

			`public function getRecords() {`
			`return $this->data;`
			`}`

			`public function printRecords() {`
			`$format = "%s:\n\t%s\n";`
			`foreach ($this->data as $record) {`
			`foreach ($record as $key => $values) {`
			`foreach ($values as $value) {`
			`printf($format, RISTags::describeTag($key), $value);`
			`}`
			`}`

			`print PHP_EOL;`
			`}`
			`}`

			`/**`
			`* Clean up the data before processing.`
			`*`
			`* @param array $lines`
			`* Indexed array of lines of data.`
			`*/`
			`protected function cleanData(&$lines) {`

			`if (empty($lines)) return;`

			`// Currently, we only need to strip a BOM if it exists.`
			`// Thanks to Derik Badman (http://madinkbeard.com/) for finding the`
			`// bug and suggesting this fix:`
			`// http://blog.philipp-michels.de/?p=32`
			`$first = $lines[0];`
			`if (substr($first, 0, 3) == pack('CCC', 0xef, 0xbb, 0xbf)) {`
			`$lines[0] = substr($first, 3);`
			`}`
			`}`

			`}`