gl-website-deployer/vendor/renanbr/bibtex-parser/src/Parser.php
2024-11-23 21:32:22 +01:00

566 lines
17 KiB
PHP

<?php
/*
* This file is part of the BibTex Parser.
*
* (c) Renan de Lima Barbosa <renandelima@gmail.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace RenanBr\BibTexParser;
use ErrorException;
use RenanBr\BibTexParser\Exception\ParserException;
class Parser
{
// Unit types reported to listeners. TYPE, TAG_NAME and the three
// *_TAG_CONTENT values double as parser states; CITATION_KEY and ENTRY are
// only ever used as unit types.
const TYPE = 'type';
const CITATION_KEY = 'citation_key';
const TAG_NAME = 'tag_name';
const RAW_TAG_CONTENT = 'raw_tag_content';
const BRACED_TAG_CONTENT = 'braced_tag_content';
const QUOTED_TAG_CONTENT = 'quoted_tag_content';
const ENTRY = 'entry';
// Parser-only states: this class never hands them to listeners as a unit type.
const NONE = 'none';
const COMMENT = 'comment';
const FIRST_TAG_NAME = 'first_tag_name';
const POST_TYPE = 'post_type';
const POST_TAG_NAME = 'post_tag_name';
const PRE_TAG_CONTENT = 'pre_tag_content';
/**
 * Current state of the state machine (one of the class constants).
 *
 * @var string
 */
private $state;
/**
 * Characters collected so far for the unit being read.
 *
 * @var string
 */
private $buffer;
/**
 * Value of $offset when the first character was put into $buffer.
 *
 * @var int|null
 */
private $bufferOffset;
/**
 * Snapshot of the first name read after the opening brace, kept until the
 * following character tells whether it is a citation key or a tag name.
 *
 * @var array|null
 */
private $firstTagSnapshot;
/**
 * Every character of the entry being read, exactly as received.
 *
 * @var string|null
 */
private $originalEntryBuffer;
/**
 * Value of $offset when the first character was put into $originalEntryBuffer.
 *
 * @var int|null
 */
private $originalEntryOffset;
/**
 * Set when an "@comment" type is detected so that the original entry text
 * collected so far is discarded instead of being reported.
 *
 * @var bool
 */
private $skipOriginalEntryReading;
/**
 * Line of the character being read, starting at 1; used in error reports.
 *
 * @var int
 */
private $line;
/**
 * Column of the character being read, starting at 1; used in error reports.
 *
 * @var int
 */
private $column;
/**
 * Number of characters consumed since the last reset, starting at 0.
 *
 * @var int
 */
private $offset;
/**
 * True when the previous character of a delimited content was a backslash.
 *
 * @var bool
 */
private $isTagContentEscaped;
/**
 * True right after a value has been read, i.e. when the next meaningful
 * character must be "#", "," or "}" rather than the start of another value.
 *
 * @var bool
 */
private $mayConcatenateTagContent;
/**
 * Character that closes the delimited content being read: '"' or '}'.
 *
 * @var string|null
 */
private $tagContentDelimiter;
/**
 * Number of braces currently open inside a braced content.
 *
 * @var int
 */
private $braceLevel;
/**
 * Listeners notified of every unit found, in registration order.
 *
 * @var ListenerInterface[]
 */
private $listeners = [];
/**
 * Registers a listener.
 *
 * Listeners are notified of each unit found, in the order they were added.
 */
public function addListener(ListenerInterface $listener)
{
    array_push($this->listeners, $listener);
}
/**
 * Parses a BibTeX file, reading it in small chunks.
 *
 * The parser state is reset before reading. The file is read 128 bytes at a
 * time; bytes belonging to a UTF-8 sequence that is cut by a chunk boundary
 * are held back and parsed together with the next chunk, so a multi-byte
 * character is never handed to the parser in two pieces.
 *
 * @param string $file
 *
 * @throws ParserException if $file given is not a valid BibTeX
 * @throws ErrorException  if $file given is not readable
 */
public function parseFile($file)
{
    // Warning suppressed on purpose: the failure is reported as an exception
    $handle = @fopen($file, 'r');
    if (!$handle) {
        throw new ErrorException(sprintf('Unable to open %s', $file));
    }
    try {
        $this->reset();
        $pending = '';
        while (!feof($handle)) {
            $chunk = fread($handle, 128);
            if (false === $chunk) {
                // A failed read never reaches EOF; without this check the
                // loop would spin forever (e.g. when $file is a directory)
                throw new ErrorException(sprintf('Unable to read %s', $file));
            }
            $pending .= $chunk;
            $completeLength = $this->completeUtf8PrefixLength($pending);
            if ($completeLength > 0) {
                $this->parse(substr($pending, 0, $completeLength));
                $pending = (string) substr($pending, $completeLength);
            }
        }
        if ('' !== $pending) {
            // The file ends in the middle of a sequence: nothing more will
            // arrive, so the leftover bytes are parsed as they are
            $this->parse($pending);
        }
        // "\0" stands for "end of input" in the error report
        $this->throwExceptionIfReadingEntry("\0");
    } finally {
        fclose($handle);
    }
}
/**
 * Tells how many leading bytes of $bytes can be parsed without cutting a
 * UTF-8 sequence in two.
 *
 * Only the last three bytes are inspected, since a UTF-8 sequence is at most
 * four bytes long. The bytes are not validated: a stray lead byte merely
 * delays its own hand-over until more bytes arrive.
 *
 * @param string $bytes
 *
 * @return int
 */
private function completeUtf8PrefixLength($bytes)
{
    $length = strlen($bytes);
    for ($back = 1; $back <= 3 && $back <= $length; ++$back) {
        $byte = ord($bytes[$length - $back]);
        if ($byte < 0x80) {
            // Single-byte character: nothing is pending
            return $length;
        }
        if ($byte >= 0xC0) {
            // Lead byte: its high bits announce the length of the sequence
            if ($byte >= 0xF0) {
                $expected = 4;
            } elseif ($byte >= 0xE0) {
                $expected = 3;
            } else {
                $expected = 2;
            }

            return $expected > $back ? $length - $back : $length;
        }
        // Continuation byte (0x80-0xBF): keep looking for its lead byte
    }

    return $length;
}
/**
 * Parses a complete BibTeX document held in a string.
 *
 * The parser state is reset first; registered listeners are kept.
 *
 * @param string $string
 *
 * @throws ParserException if $string given is not a valid BibTeX
 */
public function parseString($string)
{
$this->reset();
$this->parse($string);
// End of input: if an entry is still open this throws, reporting "\0" as
// the unexpected character
$this->throwExceptionIfReadingEntry("\0");
}
/**
 * Feeds a piece of text to the state machine, one character at a time, and
 * keeps the line, column and offset counters up to date.
 *
 * It does not reset anything, so it can be called several times in a row with
 * consecutive pieces of the same document.
 *
 * @param string $text
 */
private function parse($text)
{
    if (function_exists('mb_str_split')) {
        // One pass over the text. Calling mb_substr() for each position
        // rescans the text from its start every time, which gets quadratic
        // on large inputs.
        $chars = mb_str_split($text);
        if (!is_array($chars)) {
            $chars = [];
        }
    } else {
        // Runtimes without mb_str_split() (before PHP 7.4, no polyfill)
        $chars = [];
        $length = mb_strlen($text);
        for ($position = 0; $position < $length; ++$position) {
            $chars[] = mb_substr($text, $position, 1);
        }
    }
    foreach ($chars as $char) {
        $this->read($char);
        // Counters move only after read(), so an exception thrown by it
        // carries the position of the offending character
        if ("\n" === $char) {
            ++$this->line;
            $this->column = 1;
        } else {
            ++$this->column;
        }
        ++$this->offset;
    }
}
/**
 * Puts the parser back into its initial state, ready for a new document.
 *
 * Every per-parse field is reinitialised, including the buffer offset, so
 * nothing left behind by a parse that was aborted by an exception can leak
 * into the next one. Registered listeners are kept.
 */
private function reset()
{
    // State machine
    $this->state = self::NONE;
    $this->mayConcatenateTagContent = false;
    $this->isTagContentEscaped = false;
    $this->tagContentDelimiter = null;
    $this->braceLevel = 0;
    // Unit buffer
    $this->buffer = '';
    $this->bufferOffset = null;
    $this->firstTagSnapshot = null;
    // Original entry text
    $this->originalEntryBuffer = null;
    $this->originalEntryOffset = null;
    $this->skipOriginalEntryReading = false;
    // Position counters: line and column start at 1, offset at 0
    $this->line = 1;
    $this->column = 1;
    $this->offset = 0;
}
// ----- Readers -----------------------------------------------------------
/**
 * Hands one character to the reader in charge of the current state, then lets
 * the original-entry recorder see it.
 *
 * @param string $char
 */
private function read($char)
{
    // The reader may move to another state; the recorder needs both
    $stateBefore = $this->state;
    $readers = [
        self::NONE => 'readNone',
        self::COMMENT => 'readComment',
        self::TYPE => 'readType',
        self::POST_TYPE => 'readPostType',
        self::FIRST_TAG_NAME => 'readTagName',
        self::TAG_NAME => 'readTagName',
        self::POST_TAG_NAME => 'readPostTagName',
        self::PRE_TAG_CONTENT => 'readPreTagContent',
        self::RAW_TAG_CONTENT => 'readRawTagContent',
        self::QUOTED_TAG_CONTENT => 'readDelimitedTagContent',
        self::BRACED_TAG_CONTENT => 'readDelimitedTagContent',
    ];
    if (isset($readers[$stateBefore])) {
        $this->{$readers[$stateBefore]}($char);
    }
    $this->readOriginalEntry($char, $stateBefore);
}
/**
 * Reads a character found outside of any entry or comment.
 *
 * "@" opens an entry, whitespace is ignored and anything else starts a
 * comment.
 *
 * @param string $char
 */
private function readNone($char)
{
    if ('@' === $char) {
        $this->state = self::TYPE;

        return;
    }
    if ($this->isWhitespace($char)) {
        return;
    }
    $this->state = self::COMMENT;
}
/**
 * Reads a character of a comment; the comment ends at the first whitespace.
 *
 * @param string $char
 */
private function readComment($char)
{
    if (!$this->isWhitespace($char)) {
        return;
    }
    $this->state = self::NONE;
}
/**
 * Reads a character of the entry type, i.e. the letters following "@".
 *
 * The first character that is not an ASCII letter ends the type. A type
 * spelled "comment" (in any letter case) is not reported: everything
 * collected for it is dropped and reading goes on as a comment.
 *
 * @param string $char
 *
 * @throws ParserException if the type ends before any letter was read
 */
private function readType($char)
{
    if (1 === preg_match('/^[a-zA-Z]$/', $char)) {
        $this->appendToBuffer($char);

        return;
    }
    $this->throwExceptionIfBufferIsEmpty($char);
    $isCommentType = 'comment' === mb_strtolower($this->buffer);
    if (!$isCommentType) {
        $this->triggerListenersWithCurrentBuffer();
        // $char does not belong to the type, so it is read again as the
        // first character coming after it
        $this->state = self::POST_TYPE;
        $this->readPostType($char);

        return;
    }
    // "@comment": forget the type and the original entry text read so far
    $this->skipOriginalEntryReading = true;
    $this->buffer = '';
    $this->bufferOffset = null;
    $this->state = self::COMMENT;
    $this->readComment($char);
}
/**
 * Reads a character located between the entry type and the opening brace.
 *
 * @param string $char
 *
 * @throws ParserException if $char is neither "{" nor whitespace
 */
private function readPostType($char)
{
    if ('{' === $char) {
        $this->state = self::FIRST_TAG_NAME;

        return;
    }
    if ($this->isWhitespace($char)) {
        return;
    }
    throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
}
/**
 * Reads a character of a tag name; the first name of an entry may turn out to
 * be its citation key.
 *
 * Accepted characters are ASCII letters and digits, "_", "&", "+", ":", "-",
 * ".", "/" and the code points U+00C0 to U+01FF. Any other character ends the
 * name, unless no name has been started yet, in which case whitespace is
 * skipped and "}" closes the entry.
 *
 * "Not started yet" is checked by comparing the buffer with the empty string:
 * empty() would also be true for the name "0", which made such a name vanish
 * or leak into the following unit.
 *
 * @param string $char
 *
 * @throws ParserException if the name ends before any character was read
 */
private function readTagName($char)
{
    if (preg_match('/^[a-zA-Z0-9_\&\+:\-\.\/\x{00C0}-\x{01FF}]$/u', $char)) {
        $this->appendToBuffer($char);

        return;
    }
    $hasStarted = '' !== $this->buffer;
    if (!$hasStarted && $this->isWhitespace($char)) {
        // Skips because we didn't start reading
        return;
    }
    if (!$hasStarted && '}' === $char) {
        // No tag name found, $char is just closing current entry
        $this->state = self::NONE;

        return;
    }
    if (!$hasStarted) {
        throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
    }
    if (self::FIRST_TAG_NAME === $this->state) {
        // Takes a snapshot of current state to be triggered later as
        // tag name or citation key, see readPostTagName()
        $this->firstTagSnapshot = $this->takeBufferSnapshot();
    } else {
        // Current buffer is a simple tag name
        $this->triggerListenersWithCurrentBuffer();
    }
    // Once $char isn't a valid tag name character, it must be
    // interpreted as post tag name
    $this->state = self::POST_TAG_NAME;
    $this->readPostTagName($char);
}
/**
 * Reads a character coming after a tag name.
 *
 * This is where a pending first name is finally reported: as a tag name when
 * "=" follows, as a citation key when "," or "}" follows.
 *
 * @param string $char
 *
 * @throws ParserException if $char is none of "=", ",", "}" or whitespace
 */
private function readPostTagName($char)
{
    if ('=' === $char) {
        // A value follows, so the name cannot be a citation key
        $this->triggerListenersWithFirstTagSnapshotAs(self::TAG_NAME);
        $this->state = self::PRE_TAG_CONTENT;

        return;
    }
    if ('}' === $char || ',' === $char) {
        // The name got no value, so it is the citation key; "}" also closes
        // the entry while "," moves on to the next tag
        $this->triggerListenersWithFirstTagSnapshotAs(self::CITATION_KEY);
        $this->state = '}' === $char ? self::NONE : self::TAG_NAME;

        return;
    }
    if ($this->isWhitespace($char)) {
        return;
    }
    throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
}
/**
 * Reads a character located before, between or after the values of a tag.
 *
 * Characters fall into two groups. Those starting a value (an ASCII letter or
 * digit, '"' or "{") are only accepted when no value has just been read.
 * Those following a value ("#", "," and "}") are only accepted when a value
 * has just been read. Whitespace is always skipped.
 *
 * @param string $char
 *
 * @throws ParserException if $char is not expected at this point
 */
private function readPreTagContent($char)
{
    $startsRawValue = 1 === preg_match('/^[a-zA-Z0-9]$/', $char);
    if ($startsRawValue || '"' === $char || '{' === $char) {
        // Two values in a row need a "#" in between
        $this->throwExceptionAccordingToConcatenationAvailability($char, true);
        if ($startsRawValue) {
            // $char is already part of the value
            $this->state = self::RAW_TAG_CONTENT;
            $this->readRawTagContent($char);
        } elseif ('"' === $char) {
            $this->tagContentDelimiter = '"';
            $this->state = self::QUOTED_TAG_CONTENT;
        } else {
            $this->tagContentDelimiter = '}';
            $this->state = self::BRACED_TAG_CONTENT;
        }

        return;
    }
    if ('#' === $char || ',' === $char || '}' === $char) {
        // All three need a value right before them
        $this->throwExceptionAccordingToConcatenationAvailability($char, false);
        $this->mayConcatenateTagContent = false;
        if (',' === $char) {
            $this->state = self::TAG_NAME;
        } elseif ('}' === $char) {
            $this->state = self::NONE;
        }

        return;
    }
    if ($this->isWhitespace($char)) {
        return;
    }
    throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
}
/**
 * Reads a character of a value written without delimiters.
 *
 * Accepted characters are ASCII letters and digits, "_", "+", ":", "-", "."
 * and "/". Any other character ends the value, which is then reported.
 *
 * @param string $char
 *
 * @throws ParserException if the value ends before any character was read
 */
private function readRawTagContent($char)
{
    if (1 === preg_match('/^[a-zA-Z0-9_\+:\-\.\/]$/', $char)) {
        $this->appendToBuffer($char);

        return;
    }
    $this->throwExceptionIfBufferIsEmpty($char);
    $this->triggerListenersWithCurrentBuffer();
    // A value has just been read, and $char, which is not part of it, is
    // read again as what comes after a value
    $this->mayConcatenateTagContent = true;
    $this->state = self::PRE_TAG_CONTENT;
    $this->readPreTagContent($char);
}
/**
 * Reads a character of a value enclosed in quotes or braces.
 *
 * A backslash is not stored right away. It is dropped when the following
 * character is the closing delimiter, another backslash or "%", and stored
 * in front of that character otherwise. Inside braces, nested braces are
 * counted so that only the matching "}" ends the value.
 *
 * @param string $char
 */
private function readDelimitedTagContent($char)
{
    if ($this->isTagContentEscaped) {
        $this->isTagContentEscaped = false;
        $dropsBackslash = $this->tagContentDelimiter === $char
            || '\\' === $char
            || '%' === $char;
        if (!$dropsBackslash) {
            $this->appendToBuffer('\\');
        }
        $this->appendToBuffer($char);

        return;
    }
    if ('{' === $char && '}' === $this->tagContentDelimiter) {
        // Nested opening brace
        ++$this->braceLevel;
        $this->appendToBuffer($char);

        return;
    }
    if ($this->tagContentDelimiter === $char) {
        if (0 !== $this->braceLevel) {
            // Closes a nested brace, not the value
            --$this->braceLevel;
            $this->appendToBuffer($char);

            return;
        }
        // End of the value
        $this->triggerListenersWithCurrentBuffer();
        $this->mayConcatenateTagContent = true;
        $this->state = self::PRE_TAG_CONTENT;

        return;
    }
    if ('\\' === $char) {
        $this->isTagContentEscaped = true;

        return;
    }
    $this->appendToBuffer($char);
}
/**
 * Records the text of the entry being read exactly as it was received, and
 * reports it as an ENTRY unit once the entry is closed.
 *
 * It runs after the state reader, so $previousState is the state in which
 * $char was read and the current state is the one $char led to.
 *
 * @param string $char
 * @param string $previousState
 */
private function readOriginalEntry($char, $previousState)
{
    if ($this->skipOriginalEntryReading) {
        // Requested by readType() for "@comment": drop what was recorded
        $this->originalEntryBuffer = '';
        $this->originalEntryOffset = null;
        $this->skipOriginalEntryReading = false;

        return;
    }
    $wasInEntry = $this->isEntryState($previousState);
    $isInEntry = $this->isEntryState($this->state);
    if (!$wasInEntry && !$isInEntry) {
        // $char belongs to no entry
        return;
    }
    if (empty($this->originalEntryBuffer)) {
        // First character of the entry
        $this->originalEntryOffset = $this->offset;
    }
    $this->originalEntryBuffer .= $char;
    if ($wasInEntry && !$isInEntry) {
        // $char closed the entry; it is part of the text, hence the "+ 1"
        $context = [
            'offset' => $this->originalEntryOffset,
            'length' => $this->offset - $this->originalEntryOffset + 1,
        ];
        $this->triggerListeners($this->originalEntryBuffer, self::ENTRY, $context);
        $this->originalEntryBuffer = '';
        $this->originalEntryOffset = null;
    }
}
// ----- Listener triggers -------------------------------------------------
/**
 * Notifies every registered listener, in registration order, of a unit.
 *
 * @param string $text    text of the unit
 * @param string $type    one of the class constants
 * @param array  $context position of the unit, under the keys "offset" and
 *                        "length"
 */
private function triggerListeners($text, $type, array $context)
{
    foreach ($this->listeners as $registered) {
        $registered->bibTexUnitFound($text, $type, $context);
    }
}
/**
 * Reports the buffer content as a unit whose type is the current state, and
 * empties the buffer.
 */
private function triggerListenersWithCurrentBuffer()
{
    $unit = $this->takeBufferSnapshot();
    $this->triggerListeners($unit['text'], $this->state, $unit['context']);
}
/**
 * Reports the pending first name, if any, as a unit of the given type.
 *
 * The snapshot is cleared before listeners are called, so it is reported at
 * most once.
 *
 * @param string $type
 */
private function triggerListenersWithFirstTagSnapshotAs($type)
{
    $pending = $this->firstTagSnapshot;
    if (empty($pending)) {
        return;
    }
    $this->firstTagSnapshot = null;
    $this->triggerListeners($pending['text'], $type, $pending['context']);
}
// ----- Buffer tools ------------------------------------------------------
/**
 * Adds a character to the buffer, recording where the buffer started when it
 * is the first one.
 *
 * The buffer is compared with the empty string rather than tested with
 * empty(): empty() is also true for "0", which moved the start of any unit
 * beginning with "0" one character forward.
 *
 * @param string $char
 */
private function appendToBuffer($char)
{
    if ('' === (string) $this->buffer) {
        $this->bufferOffset = $this->offset;
    }
    $this->buffer .= $char;
}
/**
 * Returns the buffer content along with its position, and empties the buffer.
 *
 * The result has two keys: "text", the buffer content, and "context", an
 * array holding the "offset" at which the content started and its "length",
 * measured in the parsed text up to, but not including, the current
 * character.
 *
 * When nothing was buffered (an empty delimited value such as {} or "") the
 * offset is null and the length is 0. Subtracting the missing start offset
 * used to yield the current offset as length.
 *
 * @return array
 */
private function takeBufferSnapshot()
{
    $text = (string) $this->buffer;
    $hasContent = '' !== $text;
    $start = $hasContent ? $this->bufferOffset : null;
    $snapshot = [
        'text' => $text,
        'context' => [
            'offset' => $start,
            'length' => $hasContent ? $this->offset - $start : 0,
        ],
    ];
    $this->bufferOffset = null;
    $this->buffer = '';

    return $snapshot;
}
// ----- Exception throwers ------------------------------------------------
/**
 * Rejects $char when the concatenation flag has the given value.
 *
 * Callers pass the value of the flag under which $char is NOT allowed: true
 * for characters starting a value, false for characters following one.
 *
 * @param string $char
 * @param bool   $availability
 *
 * @throws ParserException if the flag is identical to $availability
 */
private function throwExceptionAccordingToConcatenationAvailability($char, $availability)
{
    if ($availability !== $this->mayConcatenateTagContent) {
        return;
    }
    throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
}
/**
 * Rejects $char when nothing has been buffered yet.
 *
 * The buffer is compared with the empty string rather than tested with
 * empty(): empty() is also true for "0", which rejected valid input such as
 * a raw value or a citation key made of the single character "0".
 *
 * @param string $char
 *
 * @throws ParserException if the buffer holds no character
 */
private function throwExceptionIfBufferIsEmpty($char)
{
    if ('' === (string) $this->buffer) {
        throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
    }
}
/**
 * Rejects $char when the parser is in the middle of an entry.
 *
 * @param string $char
 *
 * @throws ParserException if the current state is an entry state
 */
private function throwExceptionIfReadingEntry($char)
{
    if (!$this->isEntryState($this->state)) {
        return;
    }
    throw ParserException::unexpectedCharacter($char, $this->line, $this->column);
}
// ----- Auxiliaries -------------------------------------------------------
/**
 * Tells whether the given state belongs to an entry, i.e. is anything but
 * NONE and COMMENT.
 *
 * @param string $state
 *
 * @return bool
 */
private function isEntryState($state)
{
    return !in_array($state, [self::NONE, self::COMMENT], true);
}
/**
 * Tells whether $char is a space, a tab, a line feed or a carriage return.
 *
 * @param string $char
 *
 * @return bool
 */
private function isWhitespace($char)
{
    return in_array($char, [' ', "\t", "\n", "\r"], true);
}
}