westdc-zf1/include/LibRIS/RISReader.php

<?php
/**
 * This is a library for parsing RIS files.
 *
 * LibRIS::RISReader() is the main parser.
 * LibRIS::RISWriter() can generate RIS data.
 * LibRIS::RISTags() contains useful RIS information.
 *
 * @see http://www.refman.com/support/risformat_intro.asp
 */

namespace LibRIS;

/**
 * The main class for parsing RIS files.
 *
 * Usage:
 * @code
 * <?php
 *
 * use \LibRIS\RISReader;
 *
 * $reader = new RISReader();
 *
 * // Parse a file of RIS data.
 * $reader->parseFile('path/to/file.ris');
 *
 * // Parse a string containing RIS data.
 * $reader->parseString($someRisString);
 *
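 * // Parse an array of lines. NOTE: parseArray() is declared protected, so
 * // this call is only possible from within a subclass of RISReader.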
 * $reader->parseArray($arrayOfRISDirectives);
 *
 * // Get an associative array of records.
 * $array = $reader->getRecords();
 *
 * // Dump the records to STDOUT
 * $reader->printRecords();
 *
 * ?>
 * @endcode
 *
 * The data structure generated by this class is of the form
 * @code
 * <?php
 * array(
 *   [0] => array(
 *     'T1' => array('title one', 'title 2'),
 *     'TY' => array('JOUR'),
 *     // Other tags and their values.
 *   ),
 *   [1] => array(
 *     'T1' => array('another entry'),
 *     'TY' => array('JOUR'),
 *   ),
 * );
 * ?>
 * @endcode
 */
class RISReader {

  /**
   * CRLF line terminator.
   *
   * Retained for code that references the constant; parseString() itself
   * also accepts bare LF and bare CR line endings.
   */
  const RIS_EOL = "\r\n";

  /**
   * Pattern applied to every trimmed input line.
   *
   * For a tag line, group 2 holds the two-character tag and group 3 the text
   * following the dash. Any other line is captured whole in group 4 and is
   * treated as a continuation of the previous tag's value.
   */
  const LINE_REGEX = '/^(([A-Z1-9]{2})\s+-(.*))|(.*)$/';

  /**
   * The records produced by the most recent parse, or NULL before any parse.
   */
  protected $data = NULL;

  /**
   * Create a reader.
   *
   * @param array $options
   *  Currently unused.
   */
  public function __construct($options = array()) {

  }

  /**
   * Parse an RIS file.
   *
   * This will parse the file, store the resulting records on the reader
   * (see getRecords()) and return them.
   *
   * @param string $filename
   *  The full path to the file to parse.
   * @param resource $context
   *  The stream context (if desired) for handling the file.
   * @retval array
   *  An indexed array of individual sources, each of which is an
   *  associative array of entry details. (See LibRIS)
   * @throws ParseException
   *  If the file does not exist or cannot be read.
   */
  public function parseFile($filename, $context = NULL) {
    if (!is_file($filename)) {
      throw new ParseException(sprintf('File %s not found.', htmlentities($filename)));
    }

    // FILE_TEXT is intentionally not passed: it never had any effect and is
    // deprecated as of PHP 8.1.
    $contents = file($filename, FILE_SKIP_EMPTY_LINES, $context);

    // file() signals failure (e.g. missing read permission) by returning FALSE.
    if ($contents === FALSE) {
      throw new ParseException(sprintf('File %s could not be read.', htmlentities($filename)));
    }

    $this->parseArray($contents);

    return $this->data;
  }

  /**
   * Parse a string of RIS data.
   *
   * This will parse RIS data into a representative data structure, store it
   * on the reader (see getRecords()) and return it.
   *
   * @param string $string
   *  RIS-formatted data in a string. Lines may be terminated by CRLF, LF
   *  or CR.
   * @retval array
   *  An indexed array of individual sources, each of which is an
   *  associative array of entry details. (See {@link LibRIS})
   */
  public function parseString($string) {
    // Split on every common line ending, not only CRLF; otherwise data with
    // Unix line endings collapses into a single line and only its first tag
    // is seen. An explicit alternation is used instead of \R because \R, in
    // non-UTF-8 mode, also matches the byte 0x85, which occurs inside
    // multi-byte UTF-8 characters.
    $contents = preg_split('/\r\n|\n|\r/', (string) $string);
    if ($contents === FALSE) {
      $contents = array();
    }

    $this->parseArray($contents);

    return $this->data;
  }

  /**
   * Take an array of lines and parse them into RIS records.
   *
   * The result is stored on the reader and can be fetched with getRecords().
   *
   * @param array $lines
   *  Indexed array of lines of RIS data.
   */
  protected function parseArray($lines) {
    $recordset = array();

    // Anything that is not an array carries no lines to parse.
    if (!is_array($lines)) {
      $lines = array();
    }

    // Do any cleaning and normalizing.
    $this->cleanData($lines);

    $record = array();
    $lastTag = NULL;
    foreach ($lines as $line) {
      $matches = array();
      if (!preg_match(self::LINE_REGEX, trim($line), $matches)) {
        continue;
      }

      // Trailing groups that did not take part in the match are absent from
      // $matches, so every group is read defensively.
      $tag = isset($matches[2]) ? $matches[2] : '';
      $value = isset($matches[3]) ? $matches[3] : '';
      $continuation = isset($matches[4]) ? $matches[4] : '';

      // Compare against '' rather than using empty(), which would discard a
      // literal "0" value.
      if ($value !== '') {
        $lastTag = $tag;
        $record[$tag][] = trim($value);
      }
      // End record and prep a new one.
      elseif ($tag === 'ER') {
        $lastTag = NULL;
        $recordset[] = $record;
        $record = array();
      }
      // Append to the last entry. Text that appears before any tag has been
      // seen (leading junk) is skipped.
      elseif ($continuation !== '' && $lastTag !== NULL) {
        $lastEntry = count($record[$lastTag]) - 1;
        // We trim because some encoders add tabs or multiple spaces.
        // Standard is silent on how this should be handled.
        $record[$lastTag][$lastEntry] .= ' ' . trim($continuation);
      }
    }

    // Keep a trailing record that was not closed by an ER tag.
    if (!empty($record)) {
      $recordset[] = $record;
    }

    $this->data = $recordset;
  }

  /**
   * Get the records produced by the most recent parse.
   *
   * @retval array
   *  An indexed array of records, each an associative array mapping a tag to
   *  an indexed array of its values; NULL if nothing has been parsed yet.
   */
  public function getRecords() {
    return $this->data;
  }

  /**
   * Print every parsed record to standard output.
   *
   * Each value is printed under the description that RISTags::describeTag()
   * gives for its tag; records are separated by a blank line.
   */
  public function printRecords() {
    // Nothing parsed yet (or nothing found): there is nothing to print.
    if (empty($this->data)) {
      return;
    }

    $format = "%s:\n\t%s\n";
    foreach ($this->data as $record) {
      foreach ($record as $key => $values) {
        foreach ($values as $value) {
          printf($format, RISTags::describeTag($key), $value);
        }
      }

      print PHP_EOL;
    }
  }

  /**
   * Clean up the data before processing.
   *
   * @param array $lines
   *   Indexed array of lines of data. Modified in place.
   */
  protected function cleanData(&$lines) {

    if (empty($lines) || !isset($lines[0])) {
      return;
    }

    // Currently, we only need to strip a BOM if it exists.
    // Thanks to Derik Badman (http://madinkbeard.com/) for finding the
    // bug and suggesting this fix:
    // http://blog.philipp-michels.de/?p=32
    $first = $lines[0];
    if (substr($first, 0, 3) == pack('CCC', 0xef, 0xbb, 0xbf)) {
      $lines[0] = substr($first, 3);
    }
  }

}