8 * Copyright (c) 2009-2013 Nicholas J Humfrey.
9 * Copyright (c) 1997-2013 Aduna (http://www.aduna-software.com/)
10 * All rights reserved.
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright notice,
17 * this list of conditions and the following disclaimer in the documentation
18 * and/or other materials provided with the distribution.
19 * 3. The name of the author 'Nicholas J Humfrey" may be used to endorse or
20 * promote products derived from this software without specific prior
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
27 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
36 * @copyright Copyright (c) 2009-2013 Nicholas J Humfrey
37 * Copyright (c) 1997-2006 Aduna (http://www.aduna-software.com/)
38 * @license http://www.opensource.org/licenses/bsd-license.php
42 * Class to parse Turtle with no external dependencies.
44 * It is a translation from Java to PHP of the Sesame Turtle Parser:
45 * http://bit.ly/TurtleParser
47 * Last updated against version:
48 * ecda6a15a200a2fc6a062e2e43081257c3ccd4e6 (Mon Jul 29 12:05:58 2013)
51 * @copyright Copyright (c) 2009-2013 Nicholas J Humfrey
52 * Copyright (c) 1997-2013 Aduna (http://www.aduna-software.com/)
53 * @license http://www.opensource.org/licenses/bsd-license.php
55 class EasyRdf_Parser_Turtle extends EasyRdf_Parser_Ntriples
58 protected $namespaces;
69 * @return object EasyRdf_Parser_Turtle
71 public function __construct()
76 * Parse Turtle into an EasyRdf_Graph
78 * @param object EasyRdf_Graph $graph the graph to load the data into
79 * @param string $data the RDF document data
80 * @param string $format the format of the input data
81 * @param string $baseUri the base URI of the data being parsed
82 * @return integer The number of triples added to the graph
84 public function parse($graph, $data, $format, $baseUri)
86 parent::checkParseParams($graph, $data, $format, $baseUri);
88 if ($format != 'turtle') {
89 throw new EasyRdf_Exception(
90 "EasyRdf_Parser_Turtle does not support: $format"
95 $this->namespaces = array();
96 $this->subject = null;
97 $this->predicate = null;
103 $this->resetBnodeMap();
105 $c = $this->skipWSC();
107 $this->parseStatement();
108 $c = $this->skipWSC();
111 return $this->tripleCount;
116 * Parse a statement [2]
119 protected function parseStatement()
124 if ($c == -1 || self::isWhitespace($c)) {
132 if (preg_match('/^(@|prefix$|base$)/i', $directive)) {
133 $this->parseDirective($directive);
135 // SPARQL BASE and PREFIX lines do not end in .
136 if ($directive[0] == "@") {
137 $this->verifyCharacterOrFail($this->read(), ".");
140 $this->unread($directive);
141 $this->parseTriples();
143 $this->verifyCharacterOrFail($this->read(), ".");
148 * Parse a directive [3]
151 protected function parseDirective($directive)
153 $directive = strtolower($directive);
154 if ($directive == "prefix" || $directive == '@prefix') {
155 $this->parsePrefixID();
156 } elseif ($directive == "base" || $directive == '@base') {
158 } elseif (mb_strlen($directive, "UTF-8") == 0) {
159 throw new EasyRdf_Parser_Exception(
160 "Turtle Parse Error: directive name is missing, expected @prefix or @base",
165 throw new EasyRdf_Parser_Exception(
166 "Turtle Parse Error: unknown directive \"$directive\"",
174 * Parse a prefixID [4]
177 protected function parsePrefixID()
181 // Read prefix ID (e.g. "rdf:" or ":")
189 } elseif (self::isWhitespace($c)) {
191 } elseif ($c == -1) {
192 throw new EasyRdf_Parser_Exception(
193 "Turtle Parse Error: unexpected end of file while reading prefix id",
203 $this->verifyCharacterOrFail($this->read(), ":");
206 // Read the namespace URI
207 $namespace = $this->parseURI();
209 // Store local namespace mapping
210 $this->namespaces[$prefixID] = $namespace['value'];
217 protected function parseBase()
221 $baseUri = $this->parseURI();
222 $this->baseUri = new EasyRdf_ParsedUri($baseUri['value']);
229 protected function parseTriples()
233 // If the first character is an open bracket we need to decide which of
234 // the two parsing methods for blank nodes to use
241 $this->subject = $this->createBNode();
243 $this->parsePredicateObjectList();
246 $this->subject = $this->parseImplicitBlank();
251 // if this is not the end of the statement, recurse into the list of
252 // predicate and objects, using the subject parsed above as the subject
255 $this->parsePredicateObjectList();
258 $this->parseSubject();
260 $this->parsePredicateObjectList();
263 $this->subject = null;
264 $this->predicate = null;
265 $this->object = null;
269 * Parse a predicateObjectList [7]
272 protected function parsePredicateObjectList()
274 $this->predicate = $this->parsePredicate();
277 $this->parseObjectList();
279 while ($this->skipWSC() == ';') {
282 $c = $this->skipWSC();
284 if ($c == '.' || $c == ']') {
286 } elseif ($c == ';') {
287 // empty predicateObjectList, skip to next
291 $this->predicate = $this->parsePredicate();
295 $this->parseObjectList();
300 * Parse an objectList [8]
303 protected function parseObjectList()
305 $this->parseObject();
307 while ($this->skipWSC() == ',') {
310 $this->parseObject();
315 * Parse a subject [10]
318 protected function parseSubject()
322 $this->subject = $this->parseCollection();
323 } elseif ($c == '[') {
324 $this->subject = $this->parseImplicitBlank();
326 $value = $this->parseValue();
328 if ($value['type'] == 'uri' or $value['type'] == 'bnode') {
329 $this->subject = $value;
331 throw new EasyRdf_Parser_Exception(
332 "Turtle Parse Error: illegal subject type: ".$value['type'],
341 * Parse a predicate [11]
344 protected function parsePredicate()
346 // Check if the short-cut 'a' is used
352 if (self::isWhitespace($c2)) {
353 // Short-cut is used, return the rdf:type URI
356 'value' => EasyRdf_Namespace::get('rdf') . 'type'
360 // Short-cut is not used, unread all characters
365 // Predicate is a normal resource
366 $predicate = $this->parseValue();
367 if ($predicate['type'] == 'uri') {
370 throw new EasyRdf_Parser_Exception(
371 "Turtle Parse Error: Illegal predicate type: " . $predicate['type'],
379 * Parse an object [12]
382 protected function parseObject()
387 $this->object = $this->parseCollection();
388 } elseif ($c == '[') {
389 $this->object = $this->parseImplicitBlank();
391 $this->object = $this->parseValue();
395 $this->subject['value'],
396 $this->predicate['value'],
402 * Parses a blankNodePropertyList [15]
404 * This method parses the token []
405 * and predicateObjectLists that are surrounded by square brackets.
409 protected function parseImplicitBlank()
411 $this->verifyCharacterOrFail($this->read(), "[");
413 $bnode = $this->createBNode();
419 // Remember current subject and predicate
420 $oldSubject = $this->subject;
421 $oldPredicate = $this->predicate;
423 // generated bNode becomes subject
424 $this->subject = $bnode;
426 // Enter recursion with nested predicate-object list
429 $this->parsePredicateObjectList();
433 // Read closing bracket
434 $this->verifyCharacterOrFail($this->read(), "]");
436 // Restore previous subject and predicate
437 $this->subject = $oldSubject;
438 $this->predicate = $oldPredicate;
445 * Parses a collection [16], e.g: ( item1 item2 item3 )
448 protected function parseCollection()
450 $this->verifyCharacterOrFail($this->read(), "(");
452 $c = $this->skipWSC();
458 'value' => EasyRdf_Namespace::get('rdf') . 'nil'
461 $listRoot = $this->createBNode();
463 // Remember current subject and predicate
464 $oldSubject = $this->subject;
465 $oldPredicate = $this->predicate;
467 // generated bNode becomes subject, predicate becomes rdf:first
468 $this->subject = $listRoot;
469 $this->predicate = array(
471 'value' => EasyRdf_Namespace::get('rdf') . 'first'
474 $this->parseObject();
477 while ($this->skipWSC() != ')') {
478 // Create another list node and link it to the previous
479 $newNode = $this->createBNode();
483 EasyRdf_Namespace::get('rdf') . 'rest',
487 // New node becomes the current
488 $this->subject = $bNode = $newNode;
490 $this->parseObject();
499 EasyRdf_Namespace::get('rdf') . 'rest',
502 'value' => EasyRdf_Namespace::get('rdf') . 'nil'
506 // Restore previous subject and predicate
507 $this->subject = $oldSubject;
508 $this->predicate = $oldPredicate;
515 * Parses an RDF value. This method parses uriref, qname, node ID, quoted
516 * literal, integer, double and boolean.
519 protected function parseValue()
524 // uriref, e.g. <foo://bar>
525 return $this->parseURI();
526 } elseif ($c == ':' || self::isPrefixStartChar($c)) {
528 return $this->parseQNameOrBoolean();
529 } elseif ($c == '_') {
530 // node ID, e.g. _:n1
531 return $this->parseNodeID();
532 } elseif ($c == '"' || $c == "'") {
533 // quoted literal, e.g. "foo" or """foo""" or 'foo' or '''foo'''
534 return $this->parseQuotedLiteral();
535 } elseif (ctype_digit($c) || $c == '.' || $c == '+' || $c == '-') {
536 // integer or double, e.g. 123 or 1.2e3
537 return $this->parseNumber();
538 } elseif ($c == -1) {
539 throw new EasyRdf_Parser_Exception(
540 "Turtle Parse Error: unexpected end of file while reading value",
545 throw new EasyRdf_Parser_Exception(
546 "Turtle Parse Error: expected an RDF value here, found '$c'",
554 * Parses a quoted string, optionally followed by a language tag or datatype.
557 protected function parseQuotedLiteral()
559 $label = $this->parseQuotedString();
561 // Check for presence of a language tag or datatype
571 throw new EasyRdf_Parser_Exception(
572 "Turtle Parse Error: unexpected end of file while reading language",
576 } elseif (!self::isLanguageStartChar($c)) {
577 throw new EasyRdf_Parser_Exception(
578 "Turtle Parse Error: expected a letter, found '$c'",
587 while (!self::isWhitespace($c)) {
588 if ($c == '.' || $c == ';' || $c == ',' || $c == ')' || $c == ']' || $c == -1) {
591 if (self::isLanguageChar($c)) {
594 throw new EasyRdf_Parser_Exception(
595 "Turtle Parse Error: illegal language tag char: '$c'",
610 } elseif ($c == '^') {
613 // next character should be another '^'
614 $this->verifyCharacterOrFail($this->read(), "^");
617 $datatype = $this->parseValue();
618 if ($datatype['type'] == 'uri') {
622 'datatype' => $datatype['value']
625 throw new EasyRdf_Parser_Exception(
626 "Turtle Parse Error: illegal datatype type: " . $datatype['type'],
640 * Parses a quoted string, which is either a "normal string" or a """long string""".
643 protected function parseQuotedString()
649 // First character should be ' or "
650 $this->verifyCharacterOrFail($c1, "\"\'");
652 // Check for long-string, which starts and ends with three double quotes
656 if ($c2 == $c1 && $c3 == $c1) {
658 $result = $this->parseLongString($c2);
664 $result = $this->parseString($c1);
667 // Unescape any escape sequences
668 return $this->unescapeString($result);
672 * Parses a "normal string". This method requires that the opening character
673 * has already been parsed.
674 * @param string $closingCharacter The type of quote to use (either ' or ")
677 protected function parseString($closingCharacter)
684 if ($c == $closingCharacter) {
686 } elseif ($c == -1) {
687 throw new EasyRdf_Parser_Exception(
688 "Turtle Parse Error: unexpected end of file while reading string",
697 // This escapes the next character, which might be a ' or a "
700 throw new EasyRdf_Parser_Exception(
701 "Turtle Parse Error: unexpected end of file while reading string",
714 * Parses a """long string""". This method requires that the first three
715 * characters have already been parsed.
716 * @param string $closingCharacter The type of quote to use (either ' or ")
719 protected function parseLongString($closingCharacter)
722 $doubleQuoteCount = 0;
724 while ($doubleQuoteCount < 3) {
728 throw new EasyRdf_Parser_Exception(
729 "Turtle Parse Error: unexpected end of file while reading long string",
733 } elseif ($c == $closingCharacter) {
736 $doubleQuoteCount = 0;
742 // This escapes the next character, which might be a ' or "
745 throw new EasyRdf_Parser_Exception(
746 "Turtle Parse Error: unexpected end of file while reading long string",
755 return mb_substr($str, 0, -3, "UTF-8");
759 * Parses a numeric value, either of type integer, decimal or double
762 protected function parseNumber()
765 $datatype = EasyRdf_Namespace::get('xsd').'integer';
769 // read optional sign character
770 if ($c == '+' || $c == '-') {
775 while (ctype_digit($c)) {
780 if ($c == '.' || $c == 'e' || $c == 'E') {
781 // read optional fractional digits
784 if (self::isWhitespace($this->peek())) {
785 // We're parsing an integer that did not have a space before the
786 // period to end the statement
790 while (ctype_digit($c)) {
795 if (mb_strlen($value, "UTF-8") == 1) {
796 // We've only parsed a '.'
797 throw new EasyRdf_Parser_Exception(
798 "Turtle Parse Error: object for statement missing",
804 // We're parsing a decimal or a double
805 $datatype = EasyRdf_Namespace::get('xsd').'decimal';
808 if (mb_strlen($value, "UTF-8") == 0) {
809 // We've only parsed an 'e' or 'E'
810 throw new EasyRdf_Parser_Exception(
811 "Turtle Parse Error: object for statement missing",
818 // read optional exponent
819 if ($c == 'e' || $c == 'E') {
820 $datatype = EasyRdf_Namespace::get('xsd').'double';
824 if ($c == '+' || $c == '-') {
829 if (!ctype_digit($c)) {
830 throw new EasyRdf_Parser_Exception(
831 "Turtle Parse Error: exponent value missing",
840 while (ctype_digit($c)) {
847 // Unread last character, it isn't part of the number
850 // Return result as a typed literal
854 'datatype' => $datatype
862 protected function parseURI()
866 // First character should be '<'
867 $this->verifyCharacterOrFail($this->read(), "<");
869 // Read up to the next '>' character
875 } elseif ($c == -1) {
876 throw new EasyRdf_Parser_Exception(
877 "Turtle Parse Error: unexpected end of file while reading URI",
886 // This escapes the next character, which might be a '>'
889 throw new EasyRdf_Parser_Exception(
890 "Turtle Parse Error: unexpected end of file while reading URI",
899 // Unescape any escape sequences
900 $uri = $this->unescapeString($uri);
904 'value' => $this->resolve($uri)
909 * Parses qnames and boolean values, which have equivalent starting
913 protected function parseQNameOrBoolean()
915 // First character should be a ':' or a letter
918 throw new EasyRdf_Parser_Exception(
919 "Turtle Parse Error: unexpected end of file while readying value",
924 if ($c != ':' && !self::isPrefixStartChar($c)) {
925 throw new EasyRdf_Parser_Exception(
926 "Turtle Parse Error: expected a ':' or a letter, found '$c'",
935 // qname using default namespace
936 if (isset($this->namespaces[''])) {
937 $namespace = $this->namespaces[''];
939 throw new EasyRdf_Parser_Exception(
940 "Turtle Parse Error: default namespace used but not defined",
946 // $c is the first letter of the prefix
950 while (self::isPrefixChar($c)) {
956 // prefix may actually be a boolean value
959 if ($value == "true" || $value == "false") {
963 'datatype' => EasyRdf_Namespace::get('xsd') . 'boolean'
968 $this->verifyCharacterOrFail($c, ":");
970 if (isset($this->namespaces[$prefix])) {
971 $namespace = $this->namespaces[$prefix];
973 throw new EasyRdf_Parser_Exception(
974 "Turtle Parse Error: namespace prefix '$prefix' used but not defined",
981 // $c == ':', read optional local name
984 if (self::isNameStartChar($c)) {
986 $localName .= $this->readLocalEscapedChar();
992 while (self::isNameChar($c)) {
994 $localName .= $this->readLocalEscapedChar();
1002 // Unread last character
1005 // Note: namespace has already been resolved
1008 'value' => $namespace . $localName
1012 protected function readLocalEscapedChar()
1016 if (self::isLocalEscapedChar($c)) {
1019 throw new EasyRdf_Parser_Exception(
1020 "found '" . $c . "', expected one of: " . implode(', ', self::$localEscapedChars),
1028 * Parses a blank node ID, e.g: _:node1
1031 protected function parseNodeID()
1033 // Node ID should start with "_:"
1034 $this->verifyCharacterOrFail($this->read(), "_");
1035 $this->verifyCharacterOrFail($this->read(), ":");
1040 throw new EasyRdf_Parser_Exception(
1041 "Turtle Parse Error: unexpected end of file while reading node id",
1045 } elseif (!self::isNameStartChar($c)) {
1046 throw new EasyRdf_Parser_Exception(
1047 "Turtle Parse Error: expected a letter, found '$c'",
1053 // Read all following letters and numbers; they are part of the name
1056 while (self::isNameChar($c)) {
1065 'value' => $this->remapBnode($name)
1069 protected function resolve($uri)
1071 if ($this->baseUri) {
1072 return $this->baseUri->resolve($uri)->toString();
1079 * Verifies that the supplied character $c is one of the expected
1080 * characters specified in $expected. This method will throw an
1081 * exception if this is not the case.
1084 protected function verifyCharacterOrFail($c, $expected)
1087 throw new EasyRdf_Parser_Exception(
1088 "Turtle Parse Error: unexpected end of file",
1092 } elseif (strpbrk($c, $expected) === false) {
1094 for ($i = 0; $i < strlen($expected); $i++) {
1098 $msg .= '\''.$expected[$i].'\'';
1100 $msg .= ", found '$c'";
1102 throw new EasyRdf_Parser_Exception(
1103 "Turtle Parse Error: $msg",
1111 * Skip through whitespace and comments
1114 protected function skipWSC()
1117 while (self::isWhitespace($c) || $c == '#') {
1119 $this->processComment();
1130 * Consumes characters from reader until the first EOL has been read.
1133 protected function processComment()
1137 while ($c != -1 && $c != "\r" && $c != "\n") {
1142 // c is equal to -1, \r or \n.
1143 // In case c is equal to \r, we should also read a following \n.
1153 * Read a single character from the input buffer.
1154 * Returns -1 when the end of the file is reached.
1157 protected function read()
// NOTE(review): empty() also returns true for the string "0", so this branch
// is skipped when the remaining input is exactly "0" -- TODO confirm that
// treating a lone trailing "0" as end of input is acceptable.
1159 if (!empty($this->data)) {
// Take the first UTF-8 character of the remaining input.
1160 $c = mb_substr($this->data, 0, 1, "UTF-8");
1161 // Keep track of which line we are on (0A = Line Feed)
1169 if (version_compare(PHP_VERSION, '5.4.8', '<')) {
1170 // versions of PHP prior to 5.4.8 treat "NULL" length parameter as 0
1171 $this->data = mb_substr($this->data, 1, mb_strlen($this->data), "UTF-8");
// Otherwise a null length means "to the end of the string": drop the
// character just taken from the front of the buffer.
1173 $this->data = mb_substr($this->data, 1, null, "UTF-8");
1182 * Gets the next character to be returned by read()
1183 * without removing it from the input buffer.
1186 protected function peek()
// NOTE(review): empty() also returns true for the string "0", so a remaining
// buffer of exactly "0" is not peeked here -- TODO confirm this is intended.
1188 if (!empty($this->data)) {
// Return the first UTF-8 character without modifying $this->data.
1189 return mb_substr($this->data, 0, 1, "UTF-8");
1197 * Steps back, restoring the previous character read() to the input buffer
1200 protected function unread($c)
1202 # FIXME: deal with unreading new lines
1203 $this->column -= mb_strlen($c, "UTF-8");
1204 $this->data = $c . $this->data;
1208 protected function createBNode()
1212 'value' => $this->graph->newBNodeId()
1217 * Returns true if $c is a whitespace character
1220 public static function isWhitespace($c)
1222 // Whitespace characters are space, tab, newline and carriage return:
1223 return $c == "\x20" || $c == "\x09" || $c == "\x0A" || $c == "\x0D";
1227 public static function isPrefixStartChar($c)
1231 $o >= 0x41 && $o <= 0x5a || # A-Z
1232 $o >= 0x61 && $o <= 0x7a || # a-z
1233 $o >= 0x00C0 && $o <= 0x00D6 ||
1234 $o >= 0x00D8 && $o <= 0x00F6 ||
1235 $o >= 0x00F8 && $o <= 0x02FF ||
1236 $o >= 0x0370 && $o <= 0x037D ||
1237 $o >= 0x037F && $o <= 0x1FFF ||
1238 $o >= 0x200C && $o <= 0x200D ||
1239 $o >= 0x2070 && $o <= 0x218F ||
1240 $o >= 0x2C00 && $o <= 0x2FEF ||
1241 $o >= 0x3001 && $o <= 0xD7FF ||
1242 $o >= 0xF900 && $o <= 0xFDCF ||
1243 $o >= 0xFDF0 && $o <= 0xFFFD ||
1244 $o >= 0x10000 && $o <= 0xEFFFF;
1248 public static function isNameStartChar($c)
1256 self::isPrefixStartChar($c);
1260 public static function isNameChar($c)
1264 self::isNameStartChar($c) ||
1265 $o >= 0x30 && $o <= 0x39 || # 0-9
1268 $o >= 0x0300 && $o <= 0x036F ||
1269 $o >= 0x203F && $o <= 0x2040;
1273 private static $localEscapedChars = array(
1274 '_', '~', '.', '-', '!', '$', '&', '\'', '(', ')',
1275 '*', '+', ',', ';', '=', '/', '?', '#', '@', '%'
1279 public static function isLocalEscapedChar($c)
1281 return in_array($c, self::$localEscapedChars);
1285 public static function isPrefixChar($c)
1290 $o >= 0x30 && $o <= 0x39 || # 0-9
1291 self::isPrefixStartChar($c) ||
// NOTE(review): the two range checks below compare $c (the character itself)
// against code-point values, whereas the checks above use $o and the same
// ranges in isNameChar() are tested against $o. Presumably these should read
// $o as well -- TODO confirm and fix.
1294 $c >= 0x0300 && $c <= 0x036F ||
1295 $c >= 0x203F && $c <= 0x2040;
1299 public static function isLanguageStartChar($c)
1303 $o >= 0x41 && $o <= 0x5a || # A-Z
1304 $o >= 0x61 && $o <= 0x7a; # a-z
1308 public static function isLanguageChar($c)
1312 $o >= 0x41 && $o <= 0x5a || # A-Z
1313 $o >= 0x61 && $o <= 0x7a || # a-z
1314 $o >= 0x30 && $o <= 0x39 || # 0-9