3 namespace Drupal\migrate_plus\Plugin\migrate_plus\data_parser;
5 use Drupal\migrate\MigrateException;
6 use Drupal\migrate_plus\DataParserPluginBase;
9 * Obtain XML data for migration using the XMLReader pull parser.
13 * title = @Translation("XML")
16 class Xml extends DataParserPluginBase {
21 * The XMLReader we are encapsulating.
/**
 * Array of the element names from the item selector query.
 *
 * 0-based from the first (root) element. The constructor strips a single
 * leading slash and splits the rest on '/', so the item selector
 * '/file/article' is stored as [0 => 'file', 1 => 'article'].
 *
 * @var array
 */
protected $elementsToMatch = [];

/**
 * An optional xpath predicate.
 *
 * Restricts the matching elements based on values in their children. Parsed
 * from the item selector at construct time; stays NULL when the selector has
 * no predicate.
 *
 * @var string|null
 */
protected $xpathPredicate = NULL;

/**
 * Array representing the path to the current element as we traverse the XML.
 *
 * Keyed by reader depth. For example, if in an XML string like
 * '<file><article>...</article></file>' we are positioned within the article
 * element, currentPath will be [0 => 'file', 1 => 'article'].
 *
 * @var array
 */
protected $currentPath = [];

/**
 * Retains all elements with a given name to support extraction from parents.
 *
 * This is a hack to support field extraction of values in parents of the
 * 'context node' - i.e., if $this->fieldSelectors() has something like
 * '../nid'. Since we are using a streaming XML processor, it is too late to
 * snoop around parent elements again once we've located an element of
 * interest. So, grab elements with matching names and their depths, and
 * refer back to them when building the source row.
 *
 * Structure: [depth][element name] => list of elements built by
 * getSimpleXml().
 *
 * @var array
 */
protected $parentXpathCache = [];

/**
 * List of the element names that should be captured into $parentXpathCache.
 *
 * Filled by the constructor from the field selectors that start with '../'
 * or '..\'.
 *
 * @var array
 */
protected $parentElementsOfInterest = [];

/**
 * Element name matching mode.
 *
 * When matching element names, whether to compare to the namespace-prefixed
 * name (TRUE), or the local name (FALSE).
 *
 * @var bool
 */
protected $prefixedName = FALSE;
/**
 * Constructs the XML data parser.
 *
 * Splits the configured item selector into the element path and the optional
 * xpath predicate, decides whether element names are compared with or
 * without their namespace prefix, and records which parent element names
 * the field selectors refer to.
 *
 * @param array $configuration
 *   The plugin configuration. The 'item_selector' key holds the element
 *   query, e.g. '/file/article[status=1]'.
 * @param string $plugin_id
 *   The plugin ID, passed through to the parent constructor.
 * @param mixed $plugin_definition
 *   The plugin definition, passed through to the parent constructor.
 */
public function __construct(array $configuration, $plugin_id, $plugin_definition) {
  parent::__construct($configuration, $plugin_id, $plugin_definition);

  $this->reader = new \XMLReader();

  // Suppress errors during parsing, so we can pick them up after.
  libxml_use_internal_errors(TRUE);

  // Parse the element query. First capture group is the element path, second
  // (if present) is the predicate found between the square brackets.
  $item_selector = isset($configuration['item_selector']) ? (string) $configuration['item_selector'] : '';
  $element_path = '';
  $predicate = '';
  // Only read the capture groups when the pattern actually matched; a
  // selector that is missing or does not start with '/' leaves both empty
  // instead of raising undefined-offset notices.
  if (preg_match('|^/([^\[]+)\[?(.*?)]?$|', $item_selector, $matches)) {
    $element_path = $matches[1];
    $predicate = $matches[2];
  }
  $this->elementsToMatch = explode('/', $element_path);
  if ($predicate) {
    $this->xpathPredicate = $predicate;
  }

  // If the element path contains any colons, it must be specifying
  // namespaces, so we need to compare using the prefixed element name.
  // strpos() returns 0 for a match at the start, hence the strict check.
  if (strpos($element_path, ':') !== FALSE) {
    $this->prefixedName = TRUE;
  }

  // Remember the element names referenced through a parent selector
  // ('../name' or '..\name') so fetchNextRow() can retain them while
  // streaming.
  foreach ($this->fieldSelectors() as $xpath) {
    $prefix = substr($xpath, 0, 3);
    if ($prefix === '../' || $prefix === '..\\') {
      $this->parentElementsOfInterest[] = str_replace($prefix, '', $xpath);
    }
  }
}
/**
 * Builds a \SimpleXMLElement rooted at the iterator's current location.
 *
 * The resulting \SimpleXMLElement also contains any child nodes of the
 * current node.
 *
 * @return \SimpleXMLElement|false
 *   A \SimpleXMLElement when the current node could be expanded, or FALSE
 *   if it could not and libxml reported no error.
 *
 * @throws \Drupal\migrate\MigrateException
 *   When the node could not be expanded and libxml collected at least one
 *   error; the message is built from the first collected error.
 */
protected function getSimpleXml() {
  $node = $this->reader->expand();
  if (!$node) {
    $errors = libxml_get_errors();
    // Empty the libxml buffer before reporting. Otherwise the same first
    // error would be found - and thrown - again on every later failure.
    libxml_clear_errors();
    if ($errors) {
      throw new MigrateException(self::parseLibXmlError(reset($errors)));
    }
    return FALSE;
  }

  // We must associate the DOMNode with a DOMDocument to be able to import
  // it into SimpleXML. Despite appearances, this is almost twice as fast as
  // simplexml_load_string($this->readOuterXML());
  $dom = new \DOMDocument();
  $node = $dom->importNode($node, TRUE);
  $dom->appendChild($node);
  $sxml_elem = simplexml_import_dom($node);
  $this->registerNamespaces($sxml_elem);
  return $sxml_elem;
}
/**
 * Resets the traversal state before iteration starts over.
 */
public function rewind() {
  // Reset our path tracker.
  $this->currentPath = [];
  // Drop ancestors retained during the previous pass as well; they describe
  // positions in the document that are no longer current after a rewind.
  $this->parentXpathCache = [];
  // Hand over to the base class implementation.
  parent::rewind();
}
/**
 * Points the XMLReader at the given URL.
 *
 * @param string $url
 *   The URL to open.
 *
 * @return bool
 *   The result of \XMLReader::open().
 */
protected function openSourceUrl($url) {
  $reader = $this->reader;
  // Close first, so that the same reader can be (re)opened on the new URL.
  $reader->close();
  $opened = $reader->open($url, NULL, \LIBXML_NOWARNING);
  return $opened;
}
/**
 * Advances to the next matching element and fills $this->currentItem.
 *
 * Reads forward through the document, tracking the path of element names.
 * When the path equals $this->elementsToMatch and the element passes the
 * optional xpath predicate, every field selector is evaluated against it
 * and the results are stored in $this->currentItem. Nothing is stored when
 * the document ends without another matching element.
 */
protected function fetchNextRow() {
  $target_element = NULL;

  // Loop over each node in the XML file, looking for elements at a path
  // matching the input query string (represented in $this->elementsToMatch).
  while ($this->reader->read()) {
    $depth = $this->reader->depth;
    if ($this->reader->nodeType == \XMLReader::ELEMENT) {
      // Compare using either the namespace-prefixed or the local name,
      // depending on how the item selector was written.
      $name = $this->prefixedName ? $this->reader->name : $this->reader->localName;
      $this->currentPath[$depth] = $name;
      if (in_array($name, $this->parentElementsOfInterest, TRUE)) {
        // Retain the element under the same name that was used to match it,
        // so that getAncestorElements() finds it again. Elements that could
        // not be built are not retained.
        $ancestor = $this->getSimpleXml();
        if ($ancestor !== FALSE) {
          $this->parentXpathCache[$depth][$name][] = $ancestor;
        }
      }
      if ($this->currentPath == $this->elementsToMatch) {
        // We're positioned to the right element path - build the SimpleXML
        // object to enable proper xpath predicate evaluation.
        $candidate = $this->getSimpleXml();
        if ($candidate !== FALSE) {
          if (empty($this->xpathPredicate) || $this->predicateMatches($candidate)) {
            // Only an accepted element becomes the target. A rejected
            // candidate must not be left behind, or it would be emitted as
            // a row when the document ends without another match.
            $target_element = $candidate;
            break;
          }
        }
      }
    }
    elseif ($this->reader->nodeType == \XMLReader::END_ELEMENT) {
      // Remove this element and any deeper ones from the current path.
      foreach (array_keys($this->currentPath) as $path_depth) {
        if ($path_depth >= $depth) {
          unset($this->currentPath[$path_depth]);
        }
      }
      // Forget retained elements that sit deeper than the element closed.
      foreach (array_keys($this->parentXpathCache) as $cache_depth) {
        if ($cache_depth > $depth) {
          unset($this->parentXpathCache[$cache_depth]);
        }
      }
    }
  }

  // Without an accepted element there is nothing to populate.
  if ($target_element === NULL) {
    return;
  }

  // We've found the desired element: populate the currentItem with its data.
  foreach ($this->fieldSelectors() as $field_name => $xpath) {
    $prefix = substr($xpath, 0, 3);
    if (in_array($prefix, ['../', '..\\'], TRUE)) {
      $name = str_replace($prefix, '', $xpath);
      $up = substr_count($xpath, $prefix);
      $values = $this->getAncestorElements($up, $name);
    }
    else {
      $values = $target_element->xpath($xpath);
    }
    // Both lookups may yield a non-array (e.g. FALSE) when nothing usable
    // was found; there is nothing to iterate in that case.
    if (!is_array($values)) {
      continue;
    }
    foreach ($values as $value) {
      // If the SimpleXMLElement doesn't render to a string of any sort,
      // and has children then return the whole object for the process
      // plugin or other row manipulation.
      if ($value->children() && !trim((string) $value)) {
        $this->currentItem[$field_name] = $value;
      }
      else {
        $this->currentItem[$field_name][] = (string) $value;
      }
    }
  }

  // Reduce single-value results to scalars. Whole elements stored above are
  // left alone: count() on them would count their children.
  foreach ($this->currentItem as $field_name => $values) {
    if (is_array($values) && count($values) == 1) {
      $this->currentItem[$field_name] = reset($values);
    }
  }
}
/**
 * Tests whether the iterator's xpath predicate matches the provided element.
 *
 * The predicate is applied to the root of the element's own tree, so
 * predicates that reference things outside this \SimpleXMLElement cannot
 * work; "simpler" predicates should behave as expected.
 *
 * @param \SimpleXMLElement $elem
 *   The element to test.
 *
 * @return bool
 *   TRUE if the element matches the predicate, FALSE if not.
 */
protected function predicateMatches(\SimpleXMLElement $elem) {
  $query = '/*[' . $this->xpathPredicate . ']';
  $hits = $elem->xpath($query);
  if (empty($hits)) {
    return FALSE;
  }
  return TRUE;
}
/**
 * Gets the retained elements with a given name above the current position.
 *
 * Looks up the elements that were retained in $this->parentXpathCache while
 * streaming. Only element names referenced by a parent field selector
 * ('../name') are retained, so other names are never found.
 *
 * @param int $levels_up
 *   The number of levels back towards the root of the DOM tree to ascend
 *   before searching for the named element. The sign is ignored.
 * @param string $name
 *   The name of the desired element.
 *
 * @return \SimpleXMLElement[]|false
 *   The list of retained elements matching the level and name requirements,
 *   or FALSE if none is present or the name was not retained.
 */
public function getAncestorElements($levels_up, $name) {
  // Always move towards the root, whichever sign the caller used.
  $offset = $levels_up > 0 ? -$levels_up : $levels_up;
  $ancestor_depth = $this->reader->depth + $offset + 1;
  if ($ancestor_depth < 0) {
    return FALSE;
  }

  if (!array_key_exists($ancestor_depth, $this->parentXpathCache)) {
    return FALSE;
  }
  $retained = $this->parentXpathCache[$ancestor_depth];
  if (!array_key_exists($name, $retained)) {
    return FALSE;
  }
  return $retained[$name];
}