// Excerpt of web/modules/contrib/migrate_plus/src/Plugin/migrate_plus/data_parser/Xml.php
// (repository yaffs-website, blob 0213a0f7bb4df3454ff0d88de44168569a883ae6).
// The class declaration and everything preceding $elementsToMatch are not part
// of this excerpt.

  /**
   * Element names of the item selector path, 0-based from the root element.
   *
   * Filled at construct time by splitting the path part of the item selector
   * on '/'. For example, '/file/article' is stored as
   * [0 => 'file', 1 => 'article'].
   *
   * @var array
   */
  protected $elementsToMatch = [];

  /**
   * An optional xpath predicate.
   *
   * Restricts the matching elements based on values in their children. Parsed
   * from the element query at construct time.
   *
   * @var string
   */
  protected $xpathPredicate = NULL;

  /**
   * Array representing the path to the current element as we traverse the XML.
   *
   * For example, if in an XML string like
   * '<file><article>...</article></file>' we are positioned within the article
   * element, currentPath will be [0 => 'file', 1 => 'article'].
   *
   * @var array
   */
  protected $currentPath = [];

  /**
   * Retains all elements with a given name to support extraction from parents.
   *
   * This is a hack to support field extraction of values in parents
   * of the 'context node' - ie, if $this->fields() has something like '..\nid'.
   * Since we are using a streaming xml processor, it is too late to snoop
   * around parent elements again once we've located an element of interest. So,
   * grab elements with matching names and their depths, and refer back to it
   * when building the source row.
   *
   * Structure: [depth][element name][] = \SimpleXMLElement.
   *
   * @var array
   */
  protected $parentXpathCache = [];

  /**
   * List of the element names that should be captured into $parentXpathCache.
   *
   * @var array
   */
  protected $parentElementsOfInterest = [];

  /**
   * Element name matching mode.
   *
   * When matching element names, whether to compare to the namespace-prefixed
   * name (TRUE), or the local name (FALSE).
   *
   * @var bool
   */
  protected $prefixedName = FALSE;

  /**
   * {@inheritdoc}
   */
  public function __construct(array $configuration, $plugin_id, $plugin_definition) {
    parent::__construct($configuration, $plugin_id, $plugin_definition);

    $this->reader = new \XMLReader();

    // Suppress errors during parsing, so we can pick them up after.
    libxml_use_internal_errors(TRUE);

    // Parse the element query. First capture group is the element path, second
    // (if present) is the predicate written between square brackets.
    preg_match_all('|^/([^\[]+)\[?(.*?)]?$|', $configuration['item_selector'], $matches);
    $element_path = $matches[1][0];
    $this->elementsToMatch = explode('/', $element_path);
    $predicate = $matches[2][0];
    if ($predicate) {
      $this->xpathPredicate = $predicate;
    }

    // If the element path contains any colons, it must be specifying
    // namespaces, so we need to compare using the prefixed element
    // name in fetchNextRow(). strpos() returns 0 for a match at the first
    // character, so compare against FALSE explicitly.
    if (strpos($element_path, ':') !== FALSE) {
      $this->prefixedName = TRUE;
    }

    // Field selectors starting with '../' or '..\' refer to elements outside
    // the matched item; remember their names so fetchNextRow() can cache them
    // while streaming.
    foreach ($this->fieldSelectors() as $xpath) {
      $prefix = substr($xpath, 0, 3);
      if ($prefix === '../' || $prefix === '..\\') {
        $this->parentElementsOfInterest[] = str_replace($prefix, '', $xpath);
      }
    }
  }

  /**
   * Builds a \SimpleXmlElement rooted at the iterator's current location.
   *
   * The resulting SimpleXmlElement also contains any child nodes of the current
   * element.
   *
   * @return \SimpleXmlElement|false
   *   A \SimpleXmlElement when the document is parseable, or false if the
   *   current node could not be expanded and libxml reported no error.
   *
   * @throws MigrateException
   *   When the current node could not be expanded and libxml collected at
   *   least one error; the first collected error is reported.
   */
  protected function getSimpleXml() {
    $node = $this->reader->expand();
    if (!$node) {
      $errors = libxml_get_errors();
      if ($errors) {
        throw new MigrateException(self::parseLibXmlError(reset($errors)));
      }
      return FALSE;
    }

    // We must associate the DOMNode with a DOMDocument to be able to import
    // it into SimpleXML. Despite appearances, this is almost twice as fast as
    // simplexml_load_string($this->readOuterXML());
    $dom = new \DOMDocument();
    $node = $dom->importNode($node, TRUE);
    $dom->appendChild($node);
    $sxml_elem = simplexml_import_dom($node);
    $this->registerNamespaces($sxml_elem);
    return $sxml_elem;
  }

  /**
   * {@inheritdoc}
   */
  public function rewind() {
    // Reset our path tracker and drop ancestors cached during a previous pass,
    // so they cannot be mixed into rows of the new pass.
    $this->currentPath = [];
    $this->parentXpathCache = [];
    parent::rewind();
  }

  /**
   * {@inheritdoc}
   */
  protected function openSourceUrl($url) {
    // (Re)open the provided URL.
    $this->reader->close();
    return $this->reader->open($url, NULL, \LIBXML_NOWARNING);
  }

  /**
   * {@inheritdoc}
   */
  protected function fetchNextRow() {
    // Only ever holds an element that matched both the path and the predicate.
    $target_element = NULL;

    // Loop over each node in the XML file, looking for elements at a path
    // matching the input query string (represented in $this->elementsToMatch).
    while ($this->reader->read()) {
      if ($this->reader->nodeType === \XMLReader::ELEMENT) {
        $depth = $this->reader->depth;
        // Use one and the same name for path tracking, the interest test and
        // the cache key, so getAncestorElements() can find what was cached.
        $element_name = $this->prefixedName ? $this->reader->name : $this->reader->localName;
        $this->currentPath[$depth] = $element_name;
        if (in_array($element_name, $this->parentElementsOfInterest, TRUE)) {
          $this->parentXpathCache[$depth][$element_name][] = $this->getSimpleXml();
        }

        // Loose comparison on purpose: only the depth => name pairs matter.
        if ($this->currentPath == $this->elementsToMatch) {
          // We're positioned to the right element path - build the SimpleXML
          // object to enable proper xpath predicate evaluation.
          $candidate = $this->getSimpleXml();
          if ($candidate !== FALSE && (empty($this->xpathPredicate) || $this->predicateMatches($candidate))) {
            $target_element = $candidate;
            break;
          }
          // A candidate rejected by the predicate must not survive the loop,
          // otherwise it would be emitted as a row at end of input.
        }
      }
      elseif ($this->reader->nodeType === \XMLReader::END_ELEMENT) {
        $closing_depth = $this->reader->depth;
        // Remove this element and any deeper ones from the current path.
        foreach (array_keys($this->currentPath) as $depth) {
          if ($depth >= $closing_depth) {
            unset($this->currentPath[$depth]);
          }
        }
        // Cached elements strictly below the closing element are out of scope.
        foreach (array_keys($this->parentXpathCache) as $depth) {
          if ($depth > $closing_depth) {
            unset($this->parentXpathCache[$depth]);
          }
        }
      }
    }

    // Nothing (more) to return from this source.
    if ($target_element === NULL) {
      return;
    }

    // We've found the desired element: populate the currentItem with its data.
    foreach ($this->fieldSelectors() as $field_name => $xpath) {
      $prefix = substr($xpath, 0, 3);
      if ($prefix === '../' || $prefix === '..\\') {
        $name = str_replace($prefix, '', $xpath);
        $up = substr_count($xpath, $prefix);
        $values = $this->getAncestorElements($up, $name);
      }
      else {
        $values = $target_element->xpath($xpath);
      }
      // Both lookups may yield FALSE (or nothing) instead of a list.
      if (empty($values)) {
        continue;
      }
      foreach ($values as $value) {
        // The ancestor cache may hold a failed getSimpleXml() result.
        if (!$value instanceof \SimpleXMLElement) {
          continue;
        }
        // If the SimpleXMLElement doesn't render to a string of any sort,
        // and has children then return the whole object for the process
        // plugin or other row manipulation.
        if ($value->children() && !trim((string) $value)) {
          $this->currentItem[$field_name] = $value;
        }
        else {
          $this->currentItem[$field_name][] = (string) $value;
        }
      }
    }

    // Reduce single-value results to scalars. Complex values were stored as a
    // SimpleXMLElement and must be left alone: count() on such an object
    // counts its children rather than collected values.
    if (!empty($this->currentItem)) {
      foreach ($this->currentItem as $field_name => $values) {
        if (is_array($values) && count($values) === 1) {
          $this->currentItem[$field_name] = reset($values);
        }
      }
    }
  }

  /**
   * Tests whether the iterator's xpath predicate matches the provided element.
   *
   * Has some limitations esp. in that it is easy to write predicates that
   * reference things outside this SimpleXmlElement's tree, but "simpler"
   * predicates should work as expected.
   *
   * @param \SimpleXMLElement $elem
   *   The element to test.
   *
   * @return bool
   *   True if the element matches the predicate, false if not.
   */
  protected function predicateMatches(\SimpleXMLElement $elem) {
    return !empty($elem->xpath('/*[' . $this->xpathPredicate . ']'));
  }

  /**
   * Gets the retained elements of a given name at an ancestor level.
   *
   * Looks up the elements cached some number of levels above the iterator's
   * current position under the given name. Only element names referenced by a
   * field selector starting with '../' or '..\' are retained while reading
   * (see $parentElementsOfInterest).
   *
   * @param int $levels_up
   *   The number of levels back towards the root of the DOM tree to ascend
   *   before searching for the named element. The sign is ignored.
   * @param string $name
   *   The name of the desired element.
   *
   * @return \SimpleXMLElement[]|false
   *   The elements retained for that level and name, or false if the level is
   *   above the root or nothing was retained.
   */
  public function getAncestorElements($levels_up, $name) {
    if ($levels_up > 0) {
      $levels_up *= -1;
    }
    // Children of the ancestor sit one level below it, hence the + 1.
    $ancestor_depth = $this->reader->depth + $levels_up + 1;
    if ($ancestor_depth < 0) {
      return FALSE;
    }

    if (isset($this->parentXpathCache[$ancestor_depth][$name])) {
      return $this->parentXpathCache[$ancestor_depth][$name];
    }
    return FALSE;
  }

}