8 * Copyright (c) 2012-2013 Nicholas J Humfrey.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright notice,
16 * this list of conditions and the following disclaimer in the documentation
17 * and/or other materials provided with the distribution.
18 * 3. The name of the author 'Nicholas J Humfrey' may be used to endorse or
19 * promote products derived from this software without specific prior
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
26 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 * POSSIBILITY OF SUCH DAMAGE.
35 * @copyright Copyright (c) 2009-2013 Nicholas J Humfrey
36 * Copyright (c) 1997-2006 Aduna (http://www.aduna-software.com/)
37 * @license http://www.opensource.org/licenses/bsd-license.php
41 * Class to parse RDFa 1.1 with no external dependencies.
43 * http://www.w3.org/TR/rdfa-core/
46 * @copyright Copyright (c) 2012-2013 Nicholas J Humfrey
47 * @license http://www.opensource.org/licenses/bsd-license.php
// RDFa parser built on EasyRdf_Parser: parse() loads a document into a DOM
// tree and processNode() walks it, adding triples to the target graph.
49 class EasyRdf_Parser_Rdfa extends EasyRdf_Parser
// Namespace URI used with hasAttributeNS()/getAttributeNS() to read xml:lang.
51 const XML_NS = 'http://www.w3.org/XML/1998/namespace';
// Datatype URI for which processNode() builds the literal value by
// concatenating the C14N() serialisation of the element's child nodes.
52 const RDF_XML_LITERAL = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral';
// Shape of a bare term as tested in processUri(): one ASCII letter or
// underscore followed by any number of ASCII letters, digits, '_', '.' or '-'.
53 const TERM_REGEXP = '/^([a-zA-Z_])([0-9a-zA-Z_\.-]*)$/';
// Debug switch, off by default.
// NOTE(review): presumably gates the diagnostic print statements in this class - confirm.
55 public $debug = false;
60 * @return object EasyRdf_Parser_Rdfa
// Constructor; takes no arguments.
// NOTE(review): presumably only delegates to the parent constructor - confirm.
62 public function __construct()
// Add one triple to the graph being populated and update the running total.
//
// $resource - subject passed straight to the graph
// $property - predicate passed straight to the graph
// $value    - object as an array; the 'type' and 'value' keys are read by the
//             diagnostic print below
66 protected function addTriple($resource, $property, $value)
// Diagnostic trace of the triple about to be added.
69 print "Adding triple: $resource -> $property -> ".$value['type'].':'.$value['value']."\n";
// The result of graph->add() is used as the number of triples added.
71 $count = $this->graph->add($resource, $property, $value);
72 $this->tripleCount += $count;
// Emit the triples for an RDF list: every entry of $list gets a fresh blank
// node that is linked in via addTriple() and carries the entry as rdf:first;
// the list is then closed with rdf:nil.
//
// $subject  - resource the list hangs off
// $property - property that points at the list
// $list     - array of value arrays, one per list member
//
// NOTE(review): $current and $prop are presumably seeded from $subject and
// $property and advanced to the new node / rdf:rest on each pass - confirm.
76 protected function generateList($subject, $property, $list)
81 // Output a blank node for each item in the list
82 foreach ($list as $item) {
83 $newNode = $this->graph->newBNodeId();
// Link the previous position in the chain to the new blank node.
84 $this->addTriple($current, $prop, array('type' => 'bnode', 'value' => $newNode));
85 $this->addTriple($newNode, 'rdf:first', $item);
91 // Finally, terminate the list
// 'rdf:nil' is expanded to its full URI before being used as the object.
95 array('type' => 'uri', 'value' => EasyRdf_Namespace::expand('rdf:nil'))
// Append $value to the pending list kept for $property in $listMapping.
//
// $listMapping - object whose dynamic properties are keyed by property URI
//                and each hold an array of values; being an object, the
//                change is visible to the caller
// $property    - property URI used as the dynamic property name
// $value       - value array; the 'type' and 'value' keys are read by the
//                diagnostic print below
99 protected function addToList($listMapping, $property, $value)
// Diagnostic trace of the value being queued.
102 print "Adding to list: $property -> ".$value['type'].':'.$value['value']."\n";
105 // Create property in the list mapping if it doesn't already exist
106 if (!isset($listMapping->$property)) {
107 $listMapping->$property = array();
109 array_push($listMapping->$property, $value);
// Debugging helper: print a description of a DOM node (node name, then each
// attribute as "name => value"), using an indent string of $depth spaces.
//
// $node  - DOM node to describe
// $depth - nesting depth, used for the indent
//
// Throws EasyRdf_Exception when the node type is not one of the handled cases.
// NOTE(review): each case presumably prints a label for the node type - confirm.
112 protected function printNode($node, $depth)
114 $indent = str_repeat(' ', $depth);
116 switch($node->nodeType) {
117 case XML_ELEMENT_NODE:
120 case XML_ATTRIBUTE_NODE:
126 case XML_CDATA_SECTION_NODE:
129 case XML_ENTITY_REF_NODE:
132 case XML_ENTITY_NODE:
138 case XML_COMMENT_NODE:
141 case XML_DOCUMENT_NODE:
144 case XML_DOCUMENT_TYPE_NODE:
147 case XML_HTML_DOCUMENT_NODE:
// Any node type without a case above is reported as an error.
151 throw new EasyRdf_Exception("unknown node type: ".$node->nodeType);
154 print ' '.$node->nodeName."\n";
// List the attributes, one per line.
156 if ($node->hasAttributes()) {
157 foreach ($node->attributes as $attr) {
158 print $indent.' '.$attr->nodeName." => ".$attr->nodeValue."\n";
// Guess an XML Schema datatype URI from the lexical form of a date/time
// string. The patterns are tried in order and the first match wins.
//
// $value - literal text to classify
//
// Returns the datatype URI of the first matching pattern.
// NOTE(review): presumably returns null when no pattern matches - confirm.
// None of the time patterns accept fractional seconds.
163 protected function guessTimeDatatype($value)
// YYYY-MM-DD with optional leading '-' and optional 'Z' or +/-hh:mm offset.
165 if (preg_match('/^-?\d{4}-\d{2}-\d{2}(Z|[\-\+]\d{2}:\d{2})?$/', $value)) {
166 return 'http://www.w3.org/2001/XMLSchema#date';
// hh:mm:ss with optional 'Z' or +/-hh:mm offset.
167 } elseif (preg_match('/^\d{2}:\d{2}:\d{2}(Z|[\-\+]\d{2}:\d{2})?$/', $value)) {
168 return 'http://www.w3.org/2001/XMLSchema#time';
// Date and time joined by 'T', with the same optional offset.
169 } elseif (preg_match('/^-?\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(Z|[\-\+]\d{2}:\d{2})?$/', $value)) {
170 return 'http://www.w3.org/2001/XMLSchema#dateTime';
// 'P' followed by optional Y/M/D parts, an optional 'T' and optional H/M/S parts.
// NOTE(review): every part is optional, so a bare "P" or "PT" also matches - confirm acceptable.
171 } elseif (preg_match('/^P(\d+Y)?(\d+M)?(\d+D)?T?(\d+H)?(\d+M)?(\d+S)?$/', $value)) {
172 return 'http://www.w3.org/2001/XMLSchema#duration';
// Exactly four digits.
173 } elseif (preg_match('/^\d{4}$/', $value)) {
174 return 'http://www.w3.org/2001/XMLSchema#gYear';
// Four digits, '-', two digits.
175 } elseif (preg_match('/^\d{4}-\d{2}$/', $value)) {
176 return 'http://www.w3.org/2001/XMLSchema#gYearMonth';
// Build the evaluation context used for the document root: empty prefix map,
// the base URI as subject, no incomplete rel/rev triples and no list mapping,
// plus the default prefix and the default term mappings.
// NOTE(review): presumably returns the $context array - confirm.
182 protected function initialContext()
185 'prefixes' => array(),
// Triples at the top of the document are about the base URI.
187 'subject' => $this->baseUri,
191 'incompleteRels' => array(),
192 'incompleteRevs' => array(),
193 'listMapping' => null,
199 // Set the default prefix
// The empty-string key is the mapping used for CURIEs with an empty prefix.
200 $context['prefixes'][''] = 'http://www.w3.org/1999/xhtml/vocab#';
202 // RDFa 1.1 default term mapping
// Keys are lower case; processUri() lower-cases a term before looking it up.
203 $context['terms']['describedby'] = 'http://www.w3.org/2007/05/powder-s#describedby';
204 $context['terms']['license'] = 'http://www.w3.org/1999/xhtml/vocab#license';
205 $context['terms']['role'] = 'http://www.w3.org/1999/xhtml/vocab#role';
// Expand a CURIE ("prefix:local") into a full URI or a blank node identifier.
//
// $node    - DOM node, used to look up namespace declarations in scope
// $context - evaluation context; 'vocab' and 'prefixes' are read
// $value   - the CURIE text
//
// Lookup order for the prefix: '_' (blank node), empty prefix with a
// vocabulary set, the context prefix map, the namespace declarations on the
// node, and finally EasyRdf_Namespace.
// NOTE(review): presumably returns null when nothing matches - confirm.
210 protected function expandCurie($node, &$context, $value)
// The prefix part is limited to word characters and may be empty.
212 if (preg_match('/^(\w*?):(.*)$/', $value, $matches)) {
213 list (, $prefix, $local) = $matches;
// Prefixes are compared in lower case.
214 $prefix = strtolower($prefix);
215 if ($prefix === '_') {
// Drop the leading "_:" and let remapBnode() choose the identifier.
217 return $this->remapBnode(substr($value, 2));
218 } elseif (empty($prefix) and $context['vocab']) {
220 return $context['vocab'] . $local;
221 } elseif (isset($context['prefixes'][$prefix])) {
222 return $context['prefixes'][$prefix] . $local;
// Fall back to namespace declarations visible from the DOM node.
223 } elseif ($uri = $node->lookupNamespaceURI($prefix)) {
224 return $uri . $local;
225 } elseif (!empty($prefix) and $uri = EasyRdf_Namespace::get($prefix)) {
226 // Expand using well-known prefixes
227 return $uri . $local;
// Turn an attribute token into a URI. Handles, in order: a safe CURIE in
// square brackets, a bare term (only when $isProp is set), a "_:" token
// (only when $isProp is set), a CURIE, and finally an absolute or relative URI.
//
// $node    - DOM node, forwarded to expandCurie()
// $context - evaluation context; 'vocab' and 'terms' are read
// $value   - the token to process
// $isProp  - true when the token is used as a property or type
//
// NOTE(review): several branches presumably return null to reject the token - confirm.
232 protected function processUri($node, &$context, $value, $isProp = false)
// Safe CURIE: the text between the brackets is expanded as a CURIE.
234 if (preg_match('/^\[(.*)\]$/', $value, $matches)) {
236 return $this->expandCurie($node, $context, $matches[1]);
237 } elseif (preg_match(self::TERM_REGEXP, $value) and $isProp) {
// The term lookup is done in lower case, but the vocabulary is
// concatenated with the token as written.
238 $term = strtolower($value);
239 if ($context['vocab']) {
240 return $context['vocab'] . $value;
241 } elseif (isset($context['terms'][$term])) {
242 return $context['terms'][$term];
// NOTE(review): a "_:" token in property position presumably yields no URI - confirm.
244 } elseif (substr($value, 0, 2) === '_:' and $isProp) {
247 $uri = $this->expandCurie($node, $context, $value);
// Not expanded as a CURIE: treat the token as a URI reference.
251 $parsed = new EasyRdf_ParsedUri($value);
252 if ($parsed->isAbsolute()) {
255 // Properties can't be relative URIs
// Relative references are resolved against the document base URI.
257 } elseif ($this->baseUri) {
258 return $this->baseUri->resolve($parsed);
// Process a whitespace-separated list of tokens with processUri(), treating
// each one as a property ($isProp = true), and collect the results in $uris.
//
// $node    - DOM node, forwarded to processUri()
// $context - evaluation context; received by value here
// $values  - the raw attribute text
//
// NOTE(review): presumably skips tokens that produce no URI and returns $uris - confirm.
264 protected function processUriList($node, $context, $values)
271 foreach (preg_split('/\s+/', $values) as $value) {
272 $uri = $this->processUri($node, $context, $value, true);
274 array_push($uris, $uri);
// Look for a URI among one or more attributes of $node, in the order given.
//
// $node       - DOM element to inspect
// $context    - evaluation context, forwarded to processUri()
// $attributes - one attribute name or an array of names
//
// NOTE(review): presumably returns the first URI found, or null if none - confirm.
280 protected function getUriAttribute($node, &$context, $attributes)
// Accept a single attribute name as well as an array of names.
282 if (!is_array($attributes)) {
283 $attributes = array($attributes);
286 // Find the first attribute that returns a valid URI
287 foreach ($attributes as $attribute) {
288 if ($node->hasAttribute($attribute)) {
289 $value = $node->getAttribute($attribute);
// $isProp is left at its default (false), so bare terms are not expanded here.
290 $uri = $this->processUri($node, $context, $value);
// Process one DOM node and recurse into its element children, adding the
// triples found along the way via addTriple(), addToList() and generateList().
//
// $node    - DOM node to process
// $context - evaluation context array, taken by reference
// $depth   - nesting depth; each recursion passes $depth+1
//
// NOTE(review): the "Step N" comments presumably follow the processing
// sequence of the RDFa Core specification named in the class header - confirm.
298 protected function processNode($node, &$context, $depth = 1)
301 $this->printNode($node, $depth);
304 // Step 1: establish local variables
307 $typedResource = null;
311 $lang = $context['lang'];
312 $incompleteRels = array();
313 $incompleteRevs = array();
315 if ($node->nodeType === XML_ELEMENT_NODE) {
// NOTE(review): $context is a reference, so this append also changes the
// caller's array; children of one parent share the same context array and
// would see each other's path segments - confirm this is intended.
316 $context['path'] .= '/' . $node->nodeName;
// @content and @datatype distinguish "absent" (null) from "present but empty".
318 $content = $node->hasAttribute('content') ? $node->getAttribute('content') : null;
319 $datatype = $node->hasAttribute('datatype') ? $node->getAttribute('datatype') : null;
// @property and @typeof are tested for truthiness, so an empty attribute is
// treated the same as a missing one.
320 $property = $node->getAttribute('property') ? $node->getAttribute('property') : null;
321 $typeof = $node->getAttribute('typeof') ? $node->getAttribute('typeof') : null;
323 // Step 2: Default vocabulary
324 if ($node->hasAttribute('vocab')) {
// NOTE(review): written to the by-reference context, so the new vocabulary
// is also visible to the caller - confirm this is intended.
325 $context['vocab'] = $node->getAttribute('vocab');
326 if ($context['vocab']) {
329 'rdfa:usesVocabulary',
330 array('type' => 'uri', 'value' => $context['vocab'])
335 // Step 3: Set prefix mappings
336 // Support for deprecated xmlns if present in document
// $context['xmlns'] is filled by parse() when the document loads as XML.
337 foreach ($context['xmlns'] as $prefix => $uri) {
338 if ($node->hasAttribute('xmlns:' . $prefix)) {
339 $context['prefixes'][$prefix] = $node->getAttribute('xmlns:' . $prefix);
341 print "Prefix (xmlns): $prefix => $uri\n";
// @prefix holds alternating "prefix:" and URI tokens separated by whitespace.
345 if ($node->hasAttribute('prefix')) {
346 $mappings = preg_split('/\s+/', $node->getAttribute('prefix'));
347 while (count($mappings)) {
348 $prefix = strtolower(array_shift($mappings));
349 $uri = array_shift($mappings);
// Strip the trailing colon from the prefix token.
351 if (substr($prefix, -1) === ':') {
352 $prefix = substr($prefix, 0, -1);
// NOTE(review): the '_' prefix is presumably skipped rather than mapped - confirm.
357 if ($prefix === '_') {
359 } elseif (!empty($prefix)) {
360 $context['prefixes'][$prefix] = $uri;
362 print "Prefix: $prefix => $uri\n";
// xml:lang takes precedence over a plain lang attribute.
369 if ($node->hasAttributeNS(self::XML_NS, 'lang')) {
370 $lang = $node->getAttributeNS(self::XML_NS, 'lang');
371 } elseif ($node->hasAttribute('lang')) {
372 $lang = $node->getAttribute('lang');
375 // HTML+RDFa 1.1: ignore rel and rev unless they contain CURIEs.
// This only applies when @property is also present; the DOM node itself is
// modified (attribute removed or rewritten).
376 foreach (array('rel', 'rev') as $attr) {
377 if ($node->hasAttribute('property') and $node->hasAttribute($attr)) {
378 // Quick check in case there are no CURIEs to deal with.
379 if (strpos($node->getAttribute($attr), ':') === false) {
380 $node->removeAttribute($attr);
384 foreach (preg_split('/\s+/', $node->getAttribute($attr)) as $token) {
// NOTE(review): strpos() returns 0 for a token that starts with ':', which
// is falsy here, so such a token fails this test - confirm intended.
385 if (strpos($token, ':')) {
389 $node->setAttribute($attr, implode(' ', $curies));
394 $rels = $this->processUriList($node, $context, $node->getAttribute('rel'));
395 $revs = $this->processUriList($node, $context, $node->getAttribute('rev'));
397 if (!$node->hasAttribute('rel') and !$node->hasAttribute('rev')) {
398 // Step 5: Establish a new subject if no rel/rev
399 if ($property and is_null($content) and is_null($datatype)) {
400 $subject = $this->getUriAttribute($node, $context, 'about');
401 if ($typeof and !$subject) {
402 $typedResource = $this->getUriAttribute(
405 array('resource', 'href', 'src')
// No resource attribute gave a URI: type a fresh blank node instead.
407 if (!$typedResource) {
408 $typedResource = $this->graph->newBNodeId();
410 $object = $typedResource;
413 $subject = $this->getUriAttribute(
416 array('about', 'resource', 'href', 'src')
420 // Establish a subject if there isn't one
421 # FIXME: refactor this
422 if (is_null($subject)) {
423 if ($context['path'] === '/html/head') {
424 $subject = $context['object'];
425 } elseif ($depth <= 2) {
426 $subject = $this->baseUri;
427 } elseif ($typeof and !$property) {
428 $subject = $this->graph->newBNodeId();
433 $subject = $context['object'];
439 // If the current element does contain a @rel or @rev attribute, then the next step is to
440 // establish both a value for new subject and a value for current object resource:
442 $subject = $this->getUriAttribute($node, $context, 'about');
444 $object = $this->getUriAttribute(
447 array('resource', 'href', 'src')
451 if (!$object and !$subject) {
452 $object = $this->graph->newBNodeId();
454 $typedResource = $subject ? $subject : $object;
457 # FIXME: if the element is the root element of the document
458 # then act as if there is an empty @about present
460 $subject = $context['object'];
465 # FIXME: better place for this?
466 if ($typeof and $subject and !$typedResource) {
467 $typedResource = $subject;
470 // Step 7: Process @typeof if there is a subject
471 if ($typedResource) {
472 foreach ($this->processUriList($node, $context, $typeof) as $type) {
476 array('type' => 'uri', 'value' => $type)
481 // Step 8: Create new List mapping if the subject has changed
482 if ($subject and $subject !== $context['subject']) {
483 $listMapping = new StdClass();
485 $listMapping = $context['listMapping'];
488 // Step 9: Generate triples with given object
489 if ($subject and $object) {
490 foreach ($rels as $prop) {
491 $obj = array('type' => 'uri', 'value' => $object);
// With @inlist the object is queued for the list instead of being emitted now.
492 if ($node->hasAttribute('inlist')) {
493 $this->addToList($listMapping, $prop, $obj);
495 $this->addTriple($subject, $prop, $obj);
499 foreach ($revs as $prop) {
503 array('type' => 'uri', 'value' => $subject)
506 } elseif ($rels or $revs) {
507 // Step 10: Incomplete triples and bnode creation
508 $object = $this->graph->newBNodeId();
510 if ($node->hasAttribute('inlist')) {
511 foreach ($rels as $prop) {
512 # FIXME: add support for incomplete lists
513 if (!isset($listMapping->$prop)) {
514 $listMapping->$prop = array();
518 $incompleteRels = $rels;
520 print "Incomplete rels: ".implode(',', $rels)."\n";
526 $incompleteRevs = $revs;
528 print "Incomplete revs: ".implode(',', $revs)."\n";
533 // Step 11: establish current property value
534 if ($subject and $property) {
538 $datatype = $this->processUri($node, $context, $datatype, true);
// Value sources are tried in order: @content, @datetime, plain text for an
// empty datatype, serialised children for XMLLiteral, then a resource URI.
541 if ($content !== null) {
542 $value['value'] = $content;
543 } elseif ($node->hasAttribute('datetime')) {
544 $value['value'] = $node->getAttribute('datetime');
546 } elseif ($datatype === '') {
547 $value['value'] = $node->textContent;
548 } elseif ($datatype === self::RDF_XML_LITERAL) {
549 $value['value'] = '';
550 foreach ($node->childNodes as $child) {
551 $value['value'] .= $child->C14N();
553 } elseif (is_null($datatype) and empty($rels) and empty($revs)) {
554 $value['value'] = $this->getUriAttribute(
557 array('resource', 'href', 'src')
560 if ($value['value']) {
561 $value['type'] = 'uri';
565 if (empty($value['value']) and $typedResource and !$node->hasAttribute('about')) {
566 $value['type'] = 'uri';
567 $value['value'] = $typedResource;
// Last resort: use the element's text content.
570 if (empty($value['value'])) {
571 $value['value'] = $node->textContent;
574 if (empty($value['type'])) {
575 $value['type'] = 'literal';
577 $value['datatype'] = $datatype;
// NOTE(review): $datetime is presumably set in the @datetime branch above - confirm.
578 } elseif (isset($datetime) or $node->nodeName === 'time') {
579 $value['datatype'] = $this->guessTimeDatatype($value['value']);
// A language tag is only attached when no datatype was set.
582 if (empty($value['datatype']) and $lang) {
583 $value['lang'] = $lang;
587 // Add each of the properties
588 foreach ($this->processUriList($node, $context, $property) as $prop) {
589 if ($node->hasAttribute('inlist')) {
590 $this->addToList($listMapping, $prop, $value);
591 } elseif ($subject) {
592 $this->addTriple($subject, $prop, $value);
597 // Step 12: Complete the incomplete triples from the evaluation context
598 if (!$skip and $subject and ($context['incompleteRels'] or $context['incompleteRevs'])) {
599 foreach ($context['incompleteRels'] as $prop) {
603 array('type' => 'uri', 'value' => $subject)
607 foreach ($context['incompleteRevs'] as $prop) {
611 array('type' => 'uri', 'value' => $context['subject'])
617 // Step 13: create a new evaluation context and proceed recursively
618 if ($node->hasChildNodes()) {
// Array assignment copies the context.
620 $newContext = $context;
622 // Prepare a new evaluation context
623 $newContext = $context;
625 $newContext['object'] = $object;
626 } elseif ($subject) {
627 $newContext['object'] = $subject;
629 $newContext['object'] = $context['subject'];
632 $newContext['subject'] = $subject;
634 $newContext['incompleteRels'] = $incompleteRels;
635 $newContext['incompleteRevs'] = $incompleteRevs;
636 if (isset($listMapping)) {
637 $newContext['listMapping'] = $listMapping;
641 // The language is always updated, even if skip is set
642 $newContext['lang'] = $lang;
// Only element children are visited; every child receives the same
// $newContext array by reference.
644 foreach ($node->childNodes as $child) {
645 if ($child->nodeType === XML_ELEMENT_NODE) {
646 $this->processNode($child, $newContext, $depth+1);
651 // Step 14: create triples for lists
652 if (!empty($listMapping)) {
653 foreach ($listMapping as $prop => $list) {
// Lists are only emitted when this node's mapping is not the very same
// object as the one in the incoming context.
654 if ($context['listMapping'] !== $listMapping) {
656 print "Need to create triples for $prop => ".count($list)." items\n";
658 $this->generateList($subject, $prop, $list);
665 * Parse RDFa 1.1 into an EasyRdf_Graph
667 * @param object EasyRdf_Graph $graph the graph to load the data into
668 * @param string $data the RDF document data
669 * @param string $format the format of the input data
670 * @param string $baseUri the base URI of the data being parsed
671 * @return integer The number of triples added to the graph
// Entry point: validate the arguments, load $data into a DOM document (XML
// first, HTML as fallback), work out the base URI, then walk the tree with
// processNode() and return the running triple count.
673 public function parse($graph, $data, $format, $baseUri)
675 parent::checkParseParams($graph, $data, $format, $baseUri);
// Only the 'rdfa' format is accepted.
677 if ($format != 'rdfa') {
678 throw new EasyRdf_Exception(
679 "EasyRdf_Parser_Rdfa does not support: $format"
683 // Initialise evaluation context.
684 $context = $this->initialContext();
// Collect libxml errors internally instead of emitting PHP warnings.
// NOTE(review): this is a process-wide setting and the previous value does not
// appear to be restored afterwards - confirm.
686 libxml_use_internal_errors(true);
688 // Parse the document into DOM
689 $doc = new DOMDocument();
690 // Attempt to parse the document as strict XML, and fall back to HTML
691 // if XML parsing fails.
// LIBXML_NONET disables network access while loading the XML document.
692 if ($doc->loadXML($data, LIBXML_NONET)) {
694 print "Document was parsed as XML.";
696 // Collect all xmlns namespaces defined throughout the document.
697 $sxe = simplexml_import_dom($doc);
698 $context['xmlns'] = $sxe->getDocNamespaces(true);
// The default (unprefixed) namespace is not a prefix mapping.
699 unset($context['xmlns']['']);
// NOTE(review): unlike loadXML() above, no LIBXML_NONET option is passed here - confirm intended.
701 $doc->loadHTML($data);
703 print "Document was parsed as HTML.";
707 // Establish the base for both XHTML and HTML documents.
708 $xpath = new DOMXPath($doc);
709 $xpath->registerNamespace('xh', "http://www.w3.org/1999/xhtml");
// Look for a namespaced (XHTML) <base> element first.
710 $nodeList = $xpath->query('/xh:html/xh:head/xh:base');
711 if ($node = $nodeList->item(0) and $href = $node->getAttribute('href')) {
712 $this->baseUri = new EasyRdf_ParsedUri($href);
// Then look for a <base> element in no namespace.
714 $nodeList = $xpath->query('/html/head/base');
715 if ($node = $nodeList->item(0) and $href = $node->getAttribute('href')) {
716 $this->baseUri = new EasyRdf_ParsedUri($href);
719 // Remove the fragment from the base URI
720 $this->baseUri->setFragment(null);
722 // Recursively process XML nodes
// Processing starts at the document node with the default depth of 1.
723 $this->processNode($doc, $context);
725 return $this->tripleCount;