* * For the full copyright and license information, please view the LICENSE * file that was distributed with this source code. */ namespace Behat\Gherkin; use Behat\Gherkin\Exception\LexerException; use Behat\Gherkin\Keywords\KeywordsInterface; /** * Gherkin lexer. * * @author Konstantin Kudryashov */ class Lexer { private $language; private $lines; private $linesCount; private $line; private $trimmedLine; private $lineNumber; private $eos; private $keywords; private $keywordsCache = array(); private $stepKeywordTypesCache = array(); private $deferredObjects = array(); private $deferredObjectsCount = 0; private $stashedToken; private $inPyString = false; private $pyStringSwallow = 0; private $featureStarted = false; private $allowMultilineArguments = false; private $allowSteps = false; /** * Initializes lexer. * * @param KeywordsInterface $keywords Keywords holder */ public function __construct(KeywordsInterface $keywords) { $this->keywords = $keywords; } /** * Sets lexer input. * * @param string $input Input string * @param string $language Language name * * @throws Exception\LexerException */ public function analyse($input, $language = 'en') { // try to detect unsupported encoding if ('UTF-8' !== mb_detect_encoding($input, 'UTF-8', true)) { throw new LexerException('Feature file is not in UTF8 encoding'); } $input = strtr($input, array("\r\n" => "\n", "\r" => "\n")); $this->lines = explode("\n", $input); $this->linesCount = count($this->lines); $this->line = $this->lines[0]; $this->lineNumber = 1; $this->trimmedLine = null; $this->eos = false; $this->deferredObjects = array(); $this->deferredObjectsCount = 0; $this->stashedToken = null; $this->inPyString = false; $this->pyStringSwallow = 0; $this->featureStarted = false; $this->allowMultilineArguments = false; $this->allowSteps = false; $this->keywords->setLanguage($this->language = $language); $this->keywordsCache = array(); $this->stepKeywordTypesCache = array(); } /** * Returns current lexer language. * * @return string */ public function getLanguage() { return $this->language; } /** * Returns next token or previously stashed one. * * @return array */ public function getAdvancedToken() { return $this->getStashedToken() ?: $this->getNextToken(); } /** * Defers token. * * @param array $token Token to defer */ public function deferToken(array $token) { $token['deferred'] = true; $this->deferredObjects[] = $token; ++$this->deferredObjectsCount; } /** * Predicts for number of tokens. * * @return array */ public function predictToken() { if (null === $this->stashedToken) { $this->stashedToken = $this->getNextToken(); } return $this->stashedToken; } /** * Constructs token with specified parameters. * * @param string $type Token type * @param string $value Token value * * @return array */ public function takeToken($type, $value = null) { return array( 'type' => $type, 'line' => $this->lineNumber, 'value' => $value ?: null, 'deferred' => false ); } /** * Consumes line from input & increments line counter. */ protected function consumeLine() { ++$this->lineNumber; if (($this->lineNumber - 1) === $this->linesCount) { $this->eos = true; return; } $this->line = $this->lines[$this->lineNumber - 1]; $this->trimmedLine = null; } /** * Returns trimmed version of line. * * @return string */ protected function getTrimmedLine() { return null !== $this->trimmedLine ? $this->trimmedLine : $this->trimmedLine = trim($this->line); } /** * Returns stashed token or null if hasn't. * * @return array|null */ protected function getStashedToken() { $stashedToken = $this->stashedToken; $this->stashedToken = null; return $stashedToken; } /** * Returns deferred token or null if hasn't. * * @return array|null */ protected function getDeferredToken() { if (!$this->deferredObjectsCount) { return null; } --$this->deferredObjectsCount; return array_shift($this->deferredObjects); } /** * Returns next token from input. * * @return array */ protected function getNextToken() { return $this->getDeferredToken() ?: $this->scanEOS() ?: $this->scanLanguage() ?: $this->scanComment() ?: $this->scanPyStringOp() ?: $this->scanPyStringContent() ?: $this->scanStep() ?: $this->scanScenario() ?: $this->scanBackground() ?: $this->scanOutline() ?: $this->scanExamples() ?: $this->scanFeature() ?: $this->scanTags() ?: $this->scanTableRow() ?: $this->scanNewline() ?: $this->scanText(); } /** * Scans for token with specified regex. * * @param string $regex Regular expression * @param string $type Expected token type * * @return null|array */ protected function scanInput($regex, $type) { if (!preg_match($regex, $this->line, $matches)) { return null; } $token = $this->takeToken($type, $matches[1]); $this->consumeLine(); return $token; } /** * Scans for token with specified keywords. * * @param string $keywords Keywords (splitted with |) * @param string $type Expected token type * * @return null|array */ protected function scanInputForKeywords($keywords, $type) { if (!preg_match('/^(\s*)(' . $keywords . '):\s*(.*)/u', $this->line, $matches)) { return null; } $token = $this->takeToken($type, $matches[3]); $token['keyword'] = $matches[2]; $token['indent'] = mb_strlen($matches[1], 'utf8'); $this->consumeLine(); // turn off language searching if ('Feature' === $type) { $this->featureStarted = true; } // turn off PyString and Table searching if ('Feature' === $type || 'Scenario' === $type || 'Outline' === $type) { $this->allowMultilineArguments = false; } elseif ('Examples' === $type) { $this->allowMultilineArguments = true; } // turn on steps searching if ('Scenario' === $type || 'Background' === $type || 'Outline' === $type) { $this->allowSteps = true; } return $token; } /** * Scans EOS from input & returns it if found. * * @return null|array */ protected function scanEOS() { if (!$this->eos) { return null; } return $this->takeToken('EOS'); } /** * Returns keywords for provided type. * * @param string $type Keyword type * * @return string */ protected function getKeywords($type) { if (!isset($this->keywordsCache[$type])) { $getter = 'get' . $type . 'Keywords'; $keywords = $this->keywords->$getter(); if ('Step' === $type) { $padded = array(); foreach (explode('|', $keywords) as $keyword) { $padded[] = false !== mb_strpos($keyword, '<', 0, 'utf8') ? preg_quote(mb_substr($keyword, 0, -1, 'utf8'), '/') . '\s*' : preg_quote($keyword, '/') . '\s+'; } $keywords = implode('|', $padded); } $this->keywordsCache[$type] = $keywords; } return $this->keywordsCache[$type]; } /** * Scans Feature from input & returns it if found. * * @return null|array */ protected function scanFeature() { return $this->scanInputForKeywords($this->getKeywords('Feature'), 'Feature'); } /** * Scans Background from input & returns it if found. * * @return null|array */ protected function scanBackground() { return $this->scanInputForKeywords($this->getKeywords('Background'), 'Background'); } /** * Scans Scenario from input & returns it if found. * * @return null|array */ protected function scanScenario() { return $this->scanInputForKeywords($this->getKeywords('Scenario'), 'Scenario'); } /** * Scans Scenario Outline from input & returns it if found. * * @return null|array */ protected function scanOutline() { return $this->scanInputForKeywords($this->getKeywords('Outline'), 'Outline'); } /** * Scans Scenario Outline Examples from input & returns it if found. * * @return null|array */ protected function scanExamples() { return $this->scanInputForKeywords($this->getKeywords('Examples'), 'Examples'); } /** * Scans Step from input & returns it if found. * * @return null|array */ protected function scanStep() { if (!$this->allowSteps) { return null; } $keywords = $this->getKeywords('Step'); if (!preg_match('/^\s*(' . $keywords . ')([^\s].+)/u', $this->line, $matches)) { return null; } $keyword = trim($matches[1]); $token = $this->takeToken('Step', $keyword); $token['keyword_type'] = $this->getStepKeywordType($keyword); $token['text'] = $matches[2]; $this->consumeLine(); $this->allowMultilineArguments = true; return $token; } /** * Scans PyString from input & returns it if found. * * @return null|array */ protected function scanPyStringOp() { if (!$this->allowMultilineArguments) { return null; } if (false === ($pos = mb_strpos($this->line, '"""', 0, 'utf8'))) { return null; } $this->inPyString = !$this->inPyString; $token = $this->takeToken('PyStringOp'); $this->pyStringSwallow = $pos; $this->consumeLine(); return $token; } /** * Scans PyString content. * * @return null|array */ protected function scanPyStringContent() { if (!$this->inPyString) { return null; } $token = $this->scanText(); // swallow trailing spaces $token['value'] = preg_replace('/^\s{0,' . $this->pyStringSwallow . '}/u', '', $token['value']); return $token; } /** * Scans Table Row from input & returns it if found. * * @return null|array */ protected function scanTableRow() { if (!$this->allowMultilineArguments) { return null; } $line = $this->getTrimmedLine(); if (!isset($line[0]) || '|' !== $line[0] || '|' !== substr($line, -1)) { return null; } $token = $this->takeToken('TableRow'); $line = mb_substr($line, 1, mb_strlen($line, 'utf8') - 2, 'utf8'); $columns = array_map(function ($column) { return trim(str_replace('\\|', '|', $column)); }, preg_split('/(?consumeLine(); return $token; } /** * Scans Tags from input & returns it if found. * * @return null|array */ protected function scanTags() { $line = $this->getTrimmedLine(); if (!isset($line[0]) || '@' !== $line[0]) { return null; } $token = $this->takeToken('Tag'); $tags = explode('@', mb_substr($line, 1, mb_strlen($line, 'utf8') - 1, 'utf8')); $tags = array_map('trim', $tags); $token['tags'] = $tags; $this->consumeLine(); return $token; } /** * Scans Language specifier from input & returns it if found. * * @return null|array */ protected function scanLanguage() { if ($this->featureStarted) { return null; } if ($this->inPyString) { return null; } if (0 !== mb_strpos(ltrim($this->line), '#', 0, 'utf8')) { return null; } return $this->scanInput('/^\s*\#\s*language:\s*([\w_\-]+)\s*$/', 'Language'); } /** * Scans Comment from input & returns it if found. * * @return null|array */ protected function scanComment() { if ($this->inPyString) { return null; } $line = $this->getTrimmedLine(); if (0 !== mb_strpos($line, '#', 0, 'utf8')) { return null; } $token = $this->takeToken('Comment', $line); $this->consumeLine(); return $token; } /** * Scans Newline from input & returns it if found. * * @return null|array */ protected function scanNewline() { if ('' !== $this->getTrimmedLine()) { return null; } $token = $this->takeToken('Newline', mb_strlen($this->line, 'utf8')); $this->consumeLine(); return $token; } /** * Scans text from input & returns it if found. * * @return null|array */ protected function scanText() { $token = $this->takeToken('Text', $this->line); $this->consumeLine(); return $token; } /** * Returns step type keyword (Given, When, Then, etc.). * * @param string $native Step keyword in provided language * @return string */ private function getStepKeywordType($native) { // Consider "*" as a AND keyword so that it is normalized to the previous step type if ('*' === $native) { return 'And'; } if (empty($this->stepKeywordTypesCache)) { $this->stepKeywordTypesCache = array( 'Given' => explode('|', $this->keywords->getGivenKeywords()), 'When' => explode('|', $this->keywords->getWhenKeywords()), 'Then' => explode('|', $this->keywords->getThenKeywords()), 'And' => explode('|', $this->keywords->getAndKeywords()), 'But' => explode('|', $this->keywords->getButKeywords()) ); } foreach ($this->stepKeywordTypesCache as $type => $keywords) { if (in_array($native, $keywords) || in_array($native . '<', $keywords)) { return $type; } } return 'Given'; } }