<?php
/**
* XML to Associative Array Class
*
* Usage:
* $domObj = new xmlToArrayParser($xml);
* $domArr = $domObj->array;
*
* if($domObj->parse_error) echo $domObj->get_xml_error();
* else print_r($domArr);
*
* On Success:
* eg. $domArr['top']['element2']['attrib']['var2'] => val2
*
* On Error:
* eg. Error Code [76] "Mismatched tag", at char 58 on line 3
*/
/**
 * Convert an XML string to an associative array (including the tag attributes):
 * $domObj = new xmlToArrayParser($xml);
 * $elemVal = $domObj->array['element']
 * Or: $domArr=$domObj->array; $elemVal = $domArr['element'].
 *
 * Result layout:
 * - an element that holds only text becomes a plain string;
 * - an element with attributes gets an 'attrib' sub-array and, when it also
 *   has text, a 'cdata' string;
 * - repeated sibling elements of the same name become a 0-based list.
 *
 * The keys 'attrib', 'cdata' and '@parent' are used internally, so elements
 * carrying one of those names are not represented faithfully.
 *
 * @version 2.0
 * @param string $xml The XML document, passed as a string.
 */
class xmlToArrayParser {
    /** The array created by the parser can be assigned to any variable: $anyVarArr = $domObj->array.*/
    public $array = array();
    /** True when xml_parse() reported a failure for the document. */
    public $parse_error = false;
    /** The underlying XML parser handle. */
    private $parser;
    /** Reference to the array of the element that is currently open. */
    private $pointer;

    /** Constructor: $domObj = new xmlToArrayParser($xml); */
    public function __construct($xml) {
        $this->pointer =& $this->array;
        $this->parser = xml_parser_create("UTF-8");
        xml_parser_set_option($this->parser, XML_OPTION_CASE_FOLDING, false);
        // Array callables work on every PHP version and avoid xml_set_object().
        xml_set_element_handler($this->parser, array($this, 'tag_open'), array($this, 'tag_close'));
        xml_set_character_data_handler($this->parser, array($this, 'cdata'));
        // The whole document is handed over at once, so mark it as the final
        // chunk; otherwise a truncated document is not reported as an error.
        $this->parse_error = !xml_parse($this->parser, ltrim($xml), true);
    }

    /** Free the parser. */
    public function __destruct() {
        xml_parser_free($this->parser);
    }

    /**
     * Get the xml error if an error in the xml occurred during parsing.
     *
     * @return string|false HTML-formatted message with error code, column and
     *                      line, or false when parsing succeeded.
     */
    public function get_xml_error() {
        if (!$this->parse_error) {
            return $this->parse_error;
        }
        $errCode = xml_get_error_code($this->parser);
        return "Error Code [" . $errCode . "] \"<strong style='color:red;'>" . xml_error_string($errCode) . "</strong>\",\nat char "
            . xml_get_current_column_number($this->parser) . "\non line "
            . xml_get_current_line_number($this->parser);
    }

    /** Start-tag handler: creates the element's array and makes it the current pointer. */
    private function tag_open($parser, $tag, $attributes) {
        $idx = $this->convert_to_array($tag);
        // '@parent' keeps a reference back to the enclosing element so that
        // tag_close() can restore the pointer; it is removed again on close.
        if ($idx !== null) {
            $this->pointer[$tag][$idx] = array('@parent' => &$this->pointer);
            $this->pointer =& $this->pointer[$tag][$idx];
        } else {
            $this->pointer[$tag] = array('@parent' => &$this->pointer);
            $this->pointer =& $this->pointer[$tag];
        }
        if (!empty($attributes)) {
            $this->pointer['attrib'] = $attributes;
        }
    }

    /**
     * Adds the current element's content to the current pointer[cdata] string.
     * The parser may deliver one text node in several pieces (for example
     * around entities), so the pieces are appended, not overwritten.
     */
    private function cdata($parser, $cdata) {
        $previous = (isset($this->pointer['cdata']) && is_string($this->pointer['cdata']))
            ? $this->pointer['cdata']
            : '';
        $this->pointer['cdata'] = $previous . $cdata;
    }

    /** End-tag handler: finalises the current element and moves the pointer back to its parent. */
    private function tag_close($parser, $tag) {
        $current =& $this->pointer;
        $this->pointer =& $this->pointer['@parent'];
        unset($current['@parent']);
        if (isset($current['cdata']) && is_string($current['cdata'])) {
            $current['cdata'] = trim($current['cdata']);
            if (count($current) == 1) {
                // Text-only element: collapse it to the bare string.
                $current = $current['cdata'];
            } elseif ($current['cdata'] === '') {
                // Only whitespace between child elements; "0" is real text and is kept.
                unset($current['cdata']);
            }
        }
    }

    /**
     * Converts a single element item into array(element[0]) if a second element of the same name is encountered.
     *
     * @param string $tag Name of the element that is about to be opened.
     * @return int|null Index the new element must be stored at, or null when
     *                  this is the first element of that name under the parent.
     */
    private function convert_to_array($tag) {
        if (!isset($this->pointer[$tag])) {
            return null;
        }
        $existing = $this->pointer[$tag];
        // A list has integer keys starting at 0; a single element is either a
        // string or an array keyed by names. An empty array (empty element
        // without attributes) is treated as an empty list and gets replaced.
        $isList = is_array($existing) && (count($existing) === 0 || array_key_exists(0, $existing));
        if (!$isList) {
            $this->pointer[$tag] = array(0 => $existing);
        }
        return count($this->pointer[$tag]);
    }
}
?>
This is supplemental information for the "class xmlToArrayParser".
This is a fully functional error free, extensively tested php class unlike the posts that follow it.
Key phrase: Fully functional, fully tested, error free XML To Array parser.
<?php
/**
* class xmlToArrayParser
*
Notes:
1. 'attrib' and 'cdata' are keys added to the array when the element contains both attributes and content.
2. Ignores content that is not in between its own set of tags.
3. Don't know if it recognizes processing instructions nor do I know about processing instructions.
<\?some_pi some_attr="some_value"?> This is the same as a document declaration.
4. Empty elements are not included unless they have attributes.
5. Version 2.0, Dec. 2, 2011, added xml error reporting.
Usage:
$domObj = new xmlToArrayParser($xml);
$elemVal = $domObj->array['element']
Or assign the entire array to its own variable:
$domArr = $domObj->array;
$elemVal = $domArr['element']
Example:
$xml = '<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<top>
<element1>element content 1</element1>
<element2 var2="val2" />
<element3 var3="val3" var4="val4">element content 3</element3>
<element3 var5="val5">element content 4</element3>
<element3 var6="val6" />
<element3>element content 7</element3>
</top>';
$domObj = new xmlToArrayParser($xml);
$domArr = $domObj->array;
if($domObj->parse_error) echo $domObj->get_xml_error();
else print_r($domArr);
On Success:
$domArr['top']['element1'] => element content 1
$domArr['top']['element2']['attrib']['var2'] => val2
$domArr['top']['element3']['0']['attrib']['var3'] => val3
$domArr['top']['element3']['0']['attrib']['var4'] => val4
$domArr['top']['element3']['0']['cdata'] => element content 3
$domArr['top']['element3']['1']['attrib']['var5'] => val5
$domArr['top']['element3']['1']['cdata'] => element content 4
$domArr['top']['element3']['2']['attrib']['var6'] => val6
$domArr['top']['element3']['3'] => element content 7
On Error:
Error Code [76] "Mismatched tag", at char 58 on line 3
*
*/
?>
CLXXI. XML, Analisadores
Introdução
XML (eXtensible Markup Language) é um formato de dados para intercâmbio de documentos na Web. Ele é um padrão definido pelo World Wide Web Consortium (W3C). Informações sobre XML e tecnologias relacionadas podem ser encontradas em http://www.w3.org/XML/.
Esta extensão do PHP implementa suporte para o James Clark's expat no PHP. Esta ferramenta permite que você analise, mas não valide, documentos XML. Ele suporta três source character encodings também fornecido pelo PHP: US-ASCII, ISO-8859-1 e UTF-8. UTF-16 não é suportado.
Esta extensão te permite criar analisadores XML e então definir manipuladores (handlers) para diferentes eventos XML. Cada analisador XML também tem alguns parâmetros que você pode ajustar.
Dependências
Esta extensão utiliza o expat, que pode ser encontrado em http://www.jclark.com/xml/expat.html. O makefile que vem com o expat, por definição não constrói uma biblioteca, você pode usar as regras de 'make' para isso:
libexpat.a: $(OBJS)
ar -rc $@ $(OBJS)
ranlib $@ |
Instalação
Estas funções estão por definição habilitadas, usando o pacote expat library. Você pode desabilitar o suporte a XML com --disable-xml. Se você compilou o PHP como um módulo do Apache 1.3.9 ou mais novo, o PHP automaticamente utilizará o pacote expat library do Apache. Caso você não queira utilizar a biblioteca expat embutida, configure o PHP com --with-expat-dir=DIR, onde DIR deve apontar para o diretório base de instalação do expat.
A versão para Windows do PHP tem suporte embutido para esta extensão. Você não precisa carregar nenhuma extensão adicional para utilizar essas funções.
Configurações em execução
Esta extensão não define nenhum parâmetro de configuração no php.ini.
Tipos Resource
xml
O recurso xml retornado por xml_parser_create() e xml_parser_create_ns() refere-se a uma instância do analisador xml para ser usada com as funções fornecidas por esta extensão.
Constantes pré-definidas
As constantes abaixo são definidas por esta extensão e somente estarão disponíveis quando a extensão foi compilada com o PHP ou carregada dinamicamente durante a execução.
- XML_ERROR_NONE (integer)
- XML_ERROR_NO_MEMORY (integer)
- XML_ERROR_SYNTAX (integer)
- XML_ERROR_NO_ELEMENTS (integer)
- XML_ERROR_INVALID_TOKEN (integer)
- XML_ERROR_UNCLOSED_TOKEN (integer)
- XML_ERROR_PARTIAL_CHAR (integer)
- XML_ERROR_TAG_MISMATCH (integer)
- XML_ERROR_DUPLICATE_ATTRIBUTE (integer)
- XML_ERROR_JUNK_AFTER_DOC_ELEMENT (integer)
- XML_ERROR_PARAM_ENTITY_REF (integer)
- XML_ERROR_UNDEFINED_ENTITY (integer)
- XML_ERROR_RECURSIVE_ENTITY_REF (integer)
- XML_ERROR_ASYNC_ENTITY (integer)
- XML_ERROR_BAD_CHAR_REF (integer)
- XML_ERROR_BINARY_ENTITY_REF (integer)
- XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF (integer)
- XML_ERROR_MISPLACED_XML_PI (integer)
- XML_ERROR_UNKNOWN_ENCODING (integer)
- XML_ERROR_INCORRECT_ENCODING (integer)
- XML_ERROR_UNCLOSED_CDATA_SECTION (integer)
- XML_ERROR_EXTERNAL_ENTITY_HANDLING (integer)
- XML_OPTION_CASE_FOLDING (integer)
- XML_OPTION_TARGET_ENCODING (integer)
- XML_OPTION_SKIP_TAGSTART (integer)
- XML_OPTION_SKIP_WHITE (integer)
Manipuladores (handlers) de Evento
Os manipuladores (handlers) de eventos de XML definidos são:
Tabela 1. Manipuladores XML Suportados
| função do PHP para definir o manipulador(handler) | Descrição do evento |
|---|---|
| xml_set_element_handler() | Eventos de elemento são emitidos toda vez que o analisador XML encontra o início ou o fim de uma tag. Há manipuladores (handlers) separados para tags de início e tags de fim. |
| xml_set_character_data_handler() | Dados de caractere são aproximadamente todo o conteúdo de documentos XML, incluindo espaços em branco entre as tags. Note que o analisador XML não adiciona ou remove qualquer espaço em branco, ele está pronto para a aplicação se você decidir que espaços em branco são significativos. |
| xml_set_processing_instruction_handler() | Programadores de PHP já estariam familiarizados com instruções de processo (PIs). <?php ?> é uma instrução de processo, onde php é chamada de o "PI target". O tratamento destes são application-specific, exceto que todos os PI targets iniciados com "XML" estão reservados. |
| xml_set_default_handler() | O que não vai para outro manipulador vai para o manipulador padrão. Você conseguirá coisas como o XML e declarações do tipo de documento no manipulador padrão. |
| xml_set_unparsed_entity_decl_handler() | Este manipulador será chamado por uma declaração de um entity não analisada (NDATA). |
| xml_set_notation_decl_handler() | Este manipulador é chamado pela declaração de uma nota. |
| xml_set_external_entity_ref_handler() | Este manipulador é chamado quando o analisador XML encontra uma referência para uma entity geral analisada externamente. Isto pode ser uma referência para um arquivo ou URL, por exemplo. Veja Um exemplo de entity externa para uma demonstração. |
Case Folding
As funções de elementos do manipulador podem conseguir os nomes dos elementos case-folded. Case-folding é definida pelo padrão XML como "um processo aplicado para uma sequência de caracteres, em que aqueles identificados como minúsculos são substituídos pelos seus maiúsculos equivalentes". Em outras palavras, quando vai pro XML, case-folding simplesmente significa mudar pra maiúsculas.
Por definição, todos os nomes de elementos que são passados para as funções de manipulador são case-folded. Este comportamento pode ser perguntado e controlado pelo analisador XML com o xml_parser_get_option() e as funções xml_parser_set_option(), respectivamente.
Error Codes
As seguintes constantes são definidas para erros no código XML (conforme retornado por xml_parse()):
| XML_ERROR_NONE |
| XML_ERROR_NO_MEMORY |
| XML_ERROR_SYNTAX |
| XML_ERROR_NO_ELEMENTS |
| XML_ERROR_INVALID_TOKEN |
| XML_ERROR_UNCLOSED_TOKEN |
| XML_ERROR_PARTIAL_CHAR |
| XML_ERROR_TAG_MISMATCH |
| XML_ERROR_DUPLICATE_ATTRIBUTE |
| XML_ERROR_JUNK_AFTER_DOC_ELEMENT |
| XML_ERROR_PARAM_ENTITY_REF |
| XML_ERROR_UNDEFINED_ENTITY |
| XML_ERROR_RECURSIVE_ENTITY_REF |
| XML_ERROR_ASYNC_ENTITY |
| XML_ERROR_BAD_CHAR_REF |
| XML_ERROR_BINARY_ENTITY_REF |
| XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF |
| XML_ERROR_MISPLACED_XML_PI |
| XML_ERROR_UNKNOWN_ENCODING |
| XML_ERROR_INCORRECT_ENCODING |
| XML_ERROR_UNCLOSED_CDATA_SECTION |
| XML_ERROR_EXTERNAL_ENTITY_HANDLING |
Codificação de Caracter
A Extensão XML do PHP suporta o caracter Unicode definido por character encodings (codificação de caracteres) diferentes. Há dois tipos de codificação de caracteres, source encoding e target encoding. A apresentação interna do PHP do documento é sempre codificada com UTF-8.
Source encoding é feita quando um documento XML é analisado. Ao criar um analisador XML, um source encoding pode ser especificado (esta codificação não poderá ser mudada posteriormente, durante o tempo de vida do analisador XML). As codificações suportadas são ISO-8859-1, US-ASCII e UTF-8. As duas primeiras são codificações single-byte, o que significa que cada caractere é representado por um único byte. UTF-8 pode codificar caracteres compostos por um número variável de bits (até 21) em um a quatro bytes. O source encoding padrão utilizado pelo PHP é ISO-8859-1.
Target encoding é feito quando o PHP passa dados para as funções do analisador XML. Quando um analisador XML é criado, o target encoding é definido igual ao source encoding, mas este pode ser mudado em qualquer ponto. O target encoding afetará dados de caracter tão bem como nome de tags e processando alvos da instrução.
Se o analisador XML encontra caracteres de fora da linha que seu source encoding é capaz de detalhar, ele retornará um erro.
Se PHP encontra caracteres no documento XML analisado que não podem ser detalhados selecionados com target encoding, os caracteres com problema serão "demoted". Atualmente, isto significa que tais caracteres serão substituidos por um sinal de interrogação.
Exemplos
Aqui estão alguns exemplos de scripts PHP analisando documentos XML.
Exemplo estruturado de elementos XML
Este primeiro exemplo mostra a estrutura de elementos iniciais num documento com distanciamento da margem.
Exemplo arrumação de Tag XML
Exemplo 2. Arrumar XML to HTML Este exemplo arruma tags do documento XML diretamente para tags HTML. Elementos não encontrados no "array de arrumação" são ignorados. É claro, este exemplo trabalhará apenas com um tipo de documento XML específico.
|
Exemplo XML de Entity externa
Este exemplo destaca o código XML. Ele ilustra como utilizar um manipulador de referência a uma entity externa para incluir e analisar outros documentos, bem como a forma pela qual PIs (instruções de processo) podem ser processadas, e uma forma de determinar "trust" para PIs contendo códigos.
Documentos XML que podem ser utilizados para este exemplo são encontrados abaixo do exemplo (xmltest.xml e xmltest2.xml.)
Exemplo 3. Exemplo de Entity externa
|
Exemplo 4. xmltest.xml
|
Este arquivo está incluido de xmltest.xml:
- Índice
- utf8_decode -- Converte uma string com caracteres ISO-8859-1 codificadas com UTF-8 para single-byte ISO-8859-1.
- utf8_encode -- Codifica um string ISO-8859-1 para UTF-8
- xml_error_string -- Obtém uma string de erro do analisador XML
- xml_get_current_byte_index -- Obtém o índice do byte atual para um analisador XML
- xml_get_current_column_number -- Obtém o número da coluna atual para um analisador XML
- xml_get_current_line_number -- Obtém o número da linha para um analisador XML
- xml_get_error_code -- Obtém um código de erro do analisador XML
- xml_parse_into_struct -- Analisa dados XML dentro de uma estrutura de array
- xml_parse -- Inicia a análise em um documento XML
- xml_parser_create_ns -- Cria um analisador XML com suporte a namespace (uma estrutura do XML)
- xml_parser_create -- cria um analisador XML
- xml_parser_free -- Free an XML parser
- xml_parser_get_option -- Get options from an XML parser
- xml_parser_set_option -- Set options in an XML parser
- xml_set_character_data_handler -- Set up character data handler
- xml_set_default_handler -- Set up default handler
- xml_set_element_handler -- Set up start and end element handlers
- xml_set_end_namespace_decl_handler -- Set up end namespace declaration handler
- xml_set_external_entity_ref_handler -- Set up external entity reference handler
- xml_set_notation_decl_handler -- Set up notation declaration handler
- xml_set_object -- Use XML Parser within an object
- xml_set_processing_instruction_handler -- Set up processing instruction (PI) handler
- xml_set_start_namespace_decl_handler -- Set up start namespace declaration handler
- xml_set_unparsed_entity_decl_handler -- Set up unparsed entity declaration handler
Finally a simple xml => array class.
Functioning like SimpleXML library.
<?php
/**
 * Minimal XML => array converter.
 *
 * After construction $dom holds the parsed document:
 * - an element that holds only text becomes a plain string;
 * - attributes are stored under '@attributes', text next to other keys
 *   under '@data';
 * - repeated sibling elements of the same name become a 0-based list.
 * Parse errors are not reported by this class.
 */
class xml {
    /** The underlying XML parser handle. */
    private $parser;
    /** Reference to the array of the element that is currently open. */
    private $pointer;
    /** The resulting array (null until the first element is seen). */
    public $dom;

    /**
     * @param string $data The complete XML document as a string.
     */
    public function __construct($data) {
        $this->pointer =& $this->dom;
        $this->parser = xml_parser_create();
        xml_parser_set_option($this->parser, XML_OPTION_CASE_FOLDING, false);
        // Array callables work on every PHP version and avoid xml_set_object().
        xml_set_element_handler($this->parser, array($this, 'tag_open'), array($this, 'tag_close'));
        xml_set_character_data_handler($this->parser, array($this, 'cdata'));
        // The document is passed in one piece, so flag it as the final chunk.
        xml_parse($this->parser, $data, true);
    }

    /** Start-tag handler: creates the element's array and makes it the current pointer. */
    private function tag_open($parser, $tag, $attributes) {
        $idx = null;
        if (isset($this->pointer[$tag])) {
            $existing = $this->pointer[$tag];
            // A list has integer keys starting at 0; anything else (a string
            // or an array keyed by names) is a single earlier element that
            // must be wrapped before a sibling can be appended.
            $isList = is_array($existing) && (count($existing) === 0 || array_key_exists(0, $existing));
            if (!$isList) {
                $this->pointer[$tag] = array(0 => $existing);
            }
            $idx = count($this->pointer[$tag]);
        }
        // '@parent' keeps a reference back to the enclosing element so that
        // tag_close() can restore the pointer; it is removed again on close.
        if ($idx !== null) {
            $this->pointer[$tag][$idx] = array('@parent' => &$this->pointer);
            $this->pointer =& $this->pointer[$tag][$idx];
        } else {
            $this->pointer[$tag] = array('@parent' => &$this->pointer);
            $this->pointer =& $this->pointer[$tag];
        }
        if (!empty($attributes)) {
            $this->pointer['@attributes'] = $attributes;
        }
    }

    /**
     * Character-data handler. The parser may deliver one text node in several
     * pieces, so the pieces are appended instead of overwriting each other.
     */
    private function cdata($parser, $cdata) {
        $previous = isset($this->pointer['@data']) ? $this->pointer['@data'] : '';
        $this->pointer['@data'] = $previous . $cdata;
    }

    /** End-tag handler: finalises the current element and moves the pointer back to its parent. */
    private function tag_close($parser, $tag) {
        $current =& $this->pointer;
        $this->pointer =& $this->pointer['@parent'];
        unset($current['@parent']);
        if (!isset($current['@data'])) {
            return;
        }
        if (count($current) == 1) {
            // Text-only element: collapse it to the bare string.
            $current = $current['@data'];
        } elseif (trim($current['@data']) === '') {
            // Only whitespace between child elements; real text such as "0" is kept.
            unset($current['@data']);
        }
    }
}
?>
maybe I'll do some explanations on habr
Reading xml into a class:
<?PHP
// Container for one XML element. Child elements and the element's text are
// attached as dynamic properties, which stdClass (and its subclasses) permit.
class XmlData extends stdClass {}

// Stack of the elements that are currently open; index 0 is the document root.
$elements = array();
$elements[] = new XmlData();

// Start-tag handler: hang a new node under the innermost open element and
// push it on the stack. Objects are handles, so no references are needed.
function startElement($parser, $name, $attrs) {
    global $elements;
    $element = new XmlData();
    $elements[count($elements)-1]->$name = $element;
    $elements[] = $element;
}

// End-tag handler: the innermost element is complete, drop it from the stack.
function endElement($parser, $name) {
    global $elements;
    array_pop($elements);
}

// Character-data handler: the parser may deliver one text node in several
// pieces, so append to ->data instead of overwriting it.
function characterData($parser, $data) {
    global $elements;
    $current = $elements[count($elements)-1];
    $current->data = isset($current->data) ? $current->data . $data : $data;
}

// $xml is expected to hold the XML document string before this point.
$xml_parser = xml_parser_create();
xml_set_element_handler($xml_parser, "startElement", "endElement");
xml_set_character_data_handler($xml_parser, "characterData");
xml_parse($xml_parser, $xml, true);
xml_parser_free($xml_parser);

// After parsing only the root container is left on the stack.
$request = array_pop($elements);
// Element names are upper-cased because case folding is on by default.
echo $request->LOGIN->USER->data;
?>
I wanted to access the ISBN database, and was previously parsing the HTML string generated from their main page, that is until I discovered they have an API that returns XML.
So, if anyone wants to get some information from the ISBN database, all you need to do is the following.
<?php
// Search the ISBN database for the book whose ISBN is given in $_GET['ISBN'].
// The value comes from the request, so it is URL-encoded before being placed
// in the query string.
$isbn = isset($_GET['ISBN']) ? $_GET['ISBN'] : '';
$url = "http://www.isbndb.com/api/books.xml?access_key=KEY&index1=isbn&value1=" . urlencode($isbn);

// Stay null when the request fails or the element is missing from the reply.
$title = null;
$author = null;
$publisher = null;

$response = file_get_contents($url);
if ($response !== false) {
    $p = xml_parser_create();
    // $results receives the flat list of elements, $index maps each
    // (upper-cased) tag name to its positions inside $results.
    xml_parse_into_struct($p, $response, $results, $index);
    xml_parser_free($p);

    if (isset($index['TITLELONG'][0], $results[$index['TITLELONG'][0]]['value'])) {
        $title = $results[$index['TITLELONG'][0]]['value'];
    }
    if (isset($index['AUTHORSTEXT'][0], $results[$index['AUTHORSTEXT'][0]]['value'])) {
        $author = $results[$index['AUTHORSTEXT'][0]]['value'];
    }
    if (isset($index['PUBLISHERTEXT'][0], $results[$index['PUBLISHERTEXT'][0]]['value'])) {
        $publisher = $results[$index['PUBLISHERTEXT'][0]]['value'];
    }
}
?>
You will need to get an access key from isbndb.com, but it takes two seconds and is free. When you get it, replace KEY in the URL with your own key. Also, my code above will search for the book that fits the ISBN number stored in the GET variable ISBN - you can search by other parameters and return more than one result, but my example is for a simple ISBN search.
The problem I had was I needed to generate xml on the screen for users to actually see and copy to a file.
I'm generating the xml manually from a php file and the browser kept interpreting the xml...not very helpful.
This is how you get around it:
<?php
// Fetch the generated XML as a plain string.
$file = file_get_contents("http://example.com/xml.php?whatever={$whatever}");
// Escape the markup so the browser shows it as text, and turn line breaks
// into <br /> so the layout survives in HTML.
echo nl2br(htmlentities($file));
?>
Prints all my xml quite nicely.
I needed this for work/personal use. Sometimes you'll have a XML string generated as one long string and no line breaks...nusoap in the case of today/work, but there are any other number of possible things that will generate these. Anyways, this simply takes a long XML string and returns an indented/line-breaked version of the string for display/readability.
<?php
/**
 * Returns an indented, one-tag-per-line version of an XML string.
 *
 * Every tag is put on its own line; text between tags is put on its own line
 * one level deeper than the enclosing tag. Closing tags, self-closing tags
 * ("/>") and processing instructions ("<?") do not open a new level.
 * Text after the last tag is not emitted.
 *
 * @param string $str XML without line breaks.
 * @return string The formatted XML, terminated by a newline.
 */
function xmlIndent($str){
    $ret = "";
    $indent = 0;
    $indentInc = 3;
    $i = 0; // offset at which the search for the next "<" starts
    $r = 0; // offset just past the ">" of the previous tag
    while (($l = strpos($str, "<", $i)) !== false) {
        // Text between the previous tag and this one.
        if ($l != $r && $indent > 0) {
            $ret .= "\n" . str_repeat(" ", $indent) . substr($str, $r, $l - $r);
        }
        $gt = strpos($str, ">", $l + 1);
        if ($gt === false) {
            // Unterminated tag: emit the remainder as it is and stop.
            if ($ret !== "") { $ret .= "\n"; }
            $ret .= str_repeat(" ", $indent) . substr($str, $l);
            break;
        }
        $r = $gt + 1;
        // Continue after this tag so a "<" inside it is not taken for a new tag.
        $i = $r;
        $t = substr($str, $l, $r - $l);
        $noIndent = false;
        if (substr($t, 1, 1) === "/") {
            // Closing tag: step back one level.
            $indent -= $indentInc;
            $noIndent = true;
        } elseif (substr($t, -2) === "/>" || substr($t, 0, 2) === "<?") {
            $noIndent = true;
        }
        if ($indent < 0) { $indent = 0; }
        if ($ret !== "") { $ret .= "\n"; }
        $ret .= str_repeat(" ", $indent) . $t;
        if (!$noIndent) { $indent += $indentInc; }
    }
    $ret .= "\n";
    return $ret;
}
?>
(...this was only tested for what i needed at work, could POSSIBLY need additions)
Time to add my attempt at a very simple script that parses XML into a structure:
<?php
/**
 * Parses an XML string into a nested array.
 *
 * After a successful parse() $data is keyed by tag name; every tag maps to a
 * list of occurrences, each being
 * array('data' => text, 'attribs' => attributes, 'child' => nested tags).
 * After a failed parse() $data is an empty array and the error_* / current_*
 * properties describe the problem.
 */
class Simple_Parser
{
    var $parser;
    var $error_code;
    var $error_string;
    var $current_line;
    var $current_column;
    var $data = array();
    var $datas = array();

    /**
     * Parses a complete XML document.
     *
     * @param string $data The XML document.
     */
    function parse($data)
    {
        // Start from a clean state so the object can be reused and a success
        // after an earlier failure does not keep the old error details.
        $root = array();
        $this->data =& $root;
        $this->datas = array();
        $this->error_code = null;
        $this->error_string = null;
        $this->current_line = null;
        $this->current_column = null;

        $this->parser = xml_parser_create('UTF-8');
        xml_parser_set_option($this->parser, XML_OPTION_SKIP_WHITE, 1);
        // Array callables work on every PHP version and avoid xml_set_object().
        xml_set_element_handler($this->parser, array($this, 'tag_open'), array($this, 'tag_close'));
        xml_set_character_data_handler($this->parser, array($this, 'cdata'));
        // The document is passed in one piece, so flag it as the final chunk;
        // otherwise a truncated document would not be reported.
        if (!xml_parse($this->parser, $data, true))
        {
            $empty = array();
            $this->data =& $empty;
            $this->datas = array();
            $this->error_code = xml_get_error_code($this->parser);
            $this->error_string = xml_error_string($this->error_code);
            $this->current_line = xml_get_current_line_number($this->parser);
            $this->current_column = xml_get_current_column_number($this->parser);
        }
        else
        {
            $this->data = $this->data['child'];
        }
        xml_parser_free($this->parser);
    }

    /** Start-tag handler: appends a new occurrence of $tag and descends into it. */
    function tag_open($parser, $tag, $attribs)
    {
        $this->data['child'][$tag][] = array('data' => '', 'attribs' => $attribs, 'child' => array());
        // Remember the enclosing element so tag_close() can return to it.
        $this->datas[] =& $this->data;
        $this->data =& $this->data['child'][$tag][count($this->data['child'][$tag])-1];
    }

    /** Character-data handler: appends the text to the current element. */
    function cdata($parser, $cdata)
    {
        $this->data['data'] .= $cdata;
    }

    /** End-tag handler: moves back to the enclosing element. */
    function tag_close($parser, $tag)
    {
        $this->data =& $this->datas[count($this->datas)-1];
        array_pop($this->datas);
    }
}
$xml_parser = new Simple_Parser;
$xml_parser->parse('<foo><bar>test</bar></foo>');
?>
Hi !
After parsing the XML and modifying it, I just add a method to rebuild the XML from the internal structure (xmlp->document).
The method xmlp->toXML writes into xmlp->XML attributes. Then, you just have to output it.
I hope it helps.
<?php
/**
 * Reads an XML file into a nested array ($document) and can serialise that
 * array back to an XML string ($XML).
 *
 * Every tag name in $document maps to a list of occurrences. An occurrence is
 * an array that may hold the attributes under $TAG_ATTRIBUT ('attr'), the
 * text under $TAG_DATA ('data'), and child tags under their own names.
 */
class XMLParser {
    public $parser;
    public $filePath;
    public $document;
    public $currTag;
    public $tagStack;
    public $XML;
    public $_tag_to_close = false;
    public $TAG_ATTRIBUT = 'attr';
    public $TAG_DATA = 'data';

    /**
     * @param string $path Path of the XML file that parse() will read.
     */
    function __construct($path) {
        $this->parser = xml_parser_create();
        $this->filePath = $path;
        $this->document = array();
        $this->currTag =& $this->document;
        $this->tagStack = array();
        $this->XML = "";
    }

    /**
     * Old-style initialiser kept for code that calls it by name; a method
     * named after the class is no longer used as constructor on PHP 8.
     */
    function XMLParser($path) {
        $this->__construct($path);
    }

    /**
     * Parses the file given to the constructor into $document.
     * Terminates the script with a message when the file cannot be opened or
     * the XML is not well-formed.
     *
     * @return bool true on success.
     */
    function parse() {
        // Array callables work on every PHP version and avoid xml_set_object().
        xml_set_character_data_handler($this->parser, array($this, 'dataHandler'));
        xml_set_element_handler($this->parser, array($this, 'startHandler'), array($this, 'endHandler'));
        if(!($fp = fopen($this->filePath, "r"))) {
            die("Cannot open XML data file: $this->filePath");
        }
        while(($data = fread($fp, 4096)) !== false && $data !== '') {
            if(!xml_parse($this->parser, $data, feof($fp))) {
                die(sprintf("XML error: %s at line %d",
                    xml_error_string(xml_get_error_code($this->parser)),
                    xml_get_current_line_number($this->parser)));
            }
        }
        fclose($fp);
        xml_parser_free($this->parser);
        return true;
    }

    /** Start-tag handler: appends a new occurrence of $name and makes it current. */
    function startHandler($parser, $name, $attribs) {
        if(!isset($this->currTag[$name]))
            $this->currTag[$name] = array();
        $newTag = array();
        if(!empty($attribs))
            $newTag[$this->TAG_ATTRIBUT] = $attribs;
        array_push($this->currTag[$name], $newTag);
        $t =& $this->currTag[$name];
        $this->currTag =& $t[count($t)-1];
        array_push($this->tagStack, $name);
    }

    /**
     * Character-data handler: each chunk is trimmed and, unless it is then
     * empty, appended to the current element's text. "0" is kept.
     */
    function dataHandler($parser, $data) {
        $data = trim($data);
        if($data !== '') {
            if(isset($this->currTag[$this->TAG_DATA]))
                $this->currTag[$this->TAG_DATA] .= $data;
            else
                $this->currTag[$this->TAG_DATA] = $data;
        }
    }

    /**
     * End-tag handler: re-locates the parent by walking from the document
     * root along the remaining tag stack, always taking the last occurrence.
     */
    function endHandler($parser, $name) {
        $this->currTag =& $this->document;
        array_pop($this->tagStack);
        for($i = 0; $i < count($this->tagStack); $i++) {
            $t =& $this->currTag[$this->tagStack[$i]];
            $this->currTag =& $t[count($t)-1];
        }
    }

    /** Empties the output buffer $XML. */
    function clearOutput () {
        $this->XML = "";
    }

    /** Writes "<tag" and remembers that the ">" is still pending. */
    function openTag ($tag) {
        $this->XML.="<".strtolower ($tag);
        $this->_tag_to_close = true;
    }

    /** Writes the pending ">" of the start tag, if there is one. */
    function closeTag () {
        if ($this->_tag_to_close) {
            $this->XML.=">";
            $this->_tag_to_close = false;
        }
    }

    /** Writes the end tag "</tag>". */
    function closingTag ($tag) {
        $this->XML.="</".strtolower ($tag).">";
    }

    /** Writes the attributes of one occurrence; values are escaped for use inside double quotes. */
    function output_attributes ($contenu_fils) {
        foreach ($contenu_fils[$this->TAG_ATTRIBUT] as $nomAttribut => $valeur) {
            $this->XML.= " ".strtolower($nomAttribut)."=\"".htmlspecialchars($valeur, ENT_COMPAT)."\"";
        }
    }

    /**
     * Escapes element text for output. Only the XML special characters are
     * replaced; accented letters are written as they are.
     */
    function addData ($texte) {
        return htmlspecialchars($texte, ENT_NOQUOTES);
    }

    /**
     * Serialises $tags into $XML. Called without argument it resets $XML and
     * serialises the whole $document.
     *
     * @param array|string $tags Array in the $document layout, or "" for the whole document.
     */
    function toXML ($tags="") {
        if ($tags === "") {
            $tags = $this->document;
            $this->clearOutput ();
        }
        foreach ($tags as $tag => $contenu) {
            $this->process ($tag, $contenu);
        }
    }

    /**
     * Serialises every occurrence of one tag, recursing into child tags.
     *
     * @param string $tag     Tag name.
     * @param array  $contenu List of occurrences of that tag.
     */
    function process ($tag, $contenu) {
        foreach ($contenu as $indice => $contenu_fils) {
            $this->openTag ($tag);
            foreach ($contenu_fils as $tagFils => $fils) {
                if ($tagFils === $this->TAG_ATTRIBUT) {
                    $this->output_attributes ($contenu_fils);
                    $this->closeTag ();
                } elseif ($tagFils === $this->TAG_DATA) {
                    $this->closeTag ();
                    $this->XML.= $this->addData ($fils);
                } else {
                    $this->closeTag ();
                    $this->process ($tagFils, $fils);
                }
            }
            // An element without attributes, text or children never reached
            // closeTag() above; finish its start tag before the end tag.
            $this->closeTag ();
            $this->closingTag ($tag);
        }
    }
}
?>
Here's code that will create an associative array from an xml file. Keys are the tag data and subarrays are formed from attributes and child tags
<?php
// Create the parser, read the XML file and dump the resulting array.
// ("=& new" is a parse error on PHP 7 and later; objects are handles already.)
$p = new xmlParser();
$p->parse('/*xml file*/');
print_r($p->output);
?>
<?php
/**
 * Builds an array from an XML file in which child elements are keyed by
 * their text content.
 *
 * $output[0] describes the root element. Each element may have 'attrs' (its
 * attributes) and 'child' (its children, keyed by the child's text), and the
 * root additionally keeps its own text under 'content'.
 */
class xmlParser{
    var $xml_obj = null;
    var $output = array();
    var $attrs;

    function __construct(){
        $this->xml_obj = xml_parser_create();
        // Array callables work on every PHP version and avoid xml_set_object().
        xml_set_character_data_handler($this->xml_obj, array($this, 'dataHandler'));
        xml_set_element_handler($this->xml_obj, array($this, 'startHandler'), array($this, 'endHandler'));
    }

    /**
     * Old-style initialiser kept for code that calls it by name; a method
     * named after the class is no longer used as constructor on PHP 8.
     */
    function xmlParser(){
        $this->__construct();
    }

    /**
     * Parses the file at $path into $output.
     * Terminates the script with a message when the file cannot be opened or
     * the XML is not well-formed.
     *
     * @param string $path Path of the XML file.
     * @return bool true on success.
     */
    function parse($path){
        if (!($fp = fopen($path, "r"))) {
            die("Cannot open XML data file: $path");
        }
        while ($data = fread($fp, 4096)) {
            if (!xml_parse($this->xml_obj, $data, feof($fp))) {
                $message = sprintf("XML error: %s at line %d",
                    xml_error_string(xml_get_error_code($this->xml_obj)),
                    xml_get_current_line_number($this->xml_obj));
                fclose($fp);
                die($message);
            }
        }
        fclose($fp);
        return true;
    }

    /** Start-tag handler: pushes a new entry (with 'attrs' when present) on $output. */
    function startHandler($parser, $name, $attribs){
        $_content = array();
        if(!empty($attribs))
            $_content['attrs'] = $attribs;
        array_push($this->output, $_content);
    }

    /** Character-data handler: appends text to the innermost open element; a lone "\n" is skipped. */
    function dataHandler($parser, $data){
        if($data !== '' && $data != "\n") {
            $_output_idx = count($this->output) - 1;
            if (isset($this->output[$_output_idx]['content']))
                $this->output[$_output_idx]['content'] .= $data;
            else
                $this->output[$_output_idx]['content'] = $data;
        }
    }

    /**
     * End-tag handler: moves the finished element under its parent's 'child'
     * array, keyed by the element's text ('' when it has none). The root
     * element stays in $output[0].
     */
    function endHandler($parser, $name){
        if(count($this->output) > 1) {
            $_data = array_pop($this->output);
            $_output_idx = count($this->output) - 1;
            $add = array();
            if (!empty($_data['attrs']))
                $add['attrs'] = $_data['attrs'];
            if (!empty($_data['child']))
                $add['child'] = $_data['child'];
            $key = isset($_data['content']) ? $_data['content'] : '';
            $this->output[$_output_idx]['child'][$key] = $add;
        }
    }
}
?>
If you need utf8_encode support and configure PHP with --disable-all you will have some trouble. Unfortunately the configure options aren't completely documented. If you need utf8 functions and have everything disabled just recompile PHP with --enable-xml and you should be good to go.
The documentation regarding white space was never complete I think.
The XML_OPTION_SKIP_WHITE doesn't appear to do anything. I want to preserve the newlines in a cdata section. Setting XML_OPTION_SKIP_WHITE to 0 or false doesn't appear to help. My character_data_handler is getting called once for each line. This obviously should be reflected in the documentation as well. When/how often does the handler get called exactly? Having to build separate test cases is very time consuming.
Inserting newlines myself in my cdata handler is no good either. For non actual CDATA sections that cause my handler to get called, long lines are split up in multiple calls. My handler would not be able to tell the difference whether or not the subsequent calls would be due to the fact that the data is coming from the next line or the fact that some internal buffer is long enough for it to 'flush' out and call the handler.
This behaviour also needs to be properly documented.
I wrote a simple xml parser mainly to deal with rss version 2. I found lots of examples on the net, but they were all massive and bloated and hard to manipulate.
Output is sent to an array, which holds arrays containing data for each item.
Obviously, you will have to make modifications to the code to suit your needs, but there isn't a lot of code there, so that shouldn't be a problem.
<?php
// State shared with the handler functions through globals: the collected
// items and the stack of currently open element names.
$newsArray = array();
$currentElements = array();

readXML("./news.xml");

// Dump the collected items in a readable form.
echo "<pre>", print_r($newsArray, true), "</pre>";
// Parses the XML file $xmlFile, feeding its elements and text to
// startElement(), endElement() and characterData().
// Nothing is parsed when the file cannot be opened; a parse error stops the
// run and raises an E_USER_WARNING with the parser's message.
function readXML($xmlFile)
{
    $fp = fopen($xmlFile, "r");
    if ($fp === false) {
        return;
    }
    $xmlParser = xml_parser_create();
    xml_parser_set_option($xmlParser, XML_OPTION_CASE_FOLDING, false);
    // Handler names must be strings; bare words are undefined constants.
    xml_set_element_handler($xmlParser, "startElement", "endElement");
    xml_set_character_data_handler($xmlParser, "characterData");
    while (!feof($fp)) {
        // Fixed chunk size: a length of filesize() fails for an empty file.
        $data = fread($fp, 4096);
        if ($data === false) {
            break;
        }
        if (!xml_parse($xmlParser, $data, feof($fp))) {
            trigger_error(
                sprintf("XML error: %s at line %d",
                    xml_error_string(xml_get_error_code($xmlParser)),
                    xml_get_current_line_number($xmlParser)),
                E_USER_WARNING
            );
            break;
        }
    }
    fclose($fp);
    xml_parser_free($xmlParser);
}
// Start-tag handler: records the element name on the stack of open elements
// and counts every "item" element that is opened.
function startElement($parser, $name, $attrs)
{
    global $currentElements, $itemCount;

    $currentElements[] = $name;

    if ("item" == $name) {
        $itemCount = $itemCount + 1;
    }
}
// Character-data handler: stores the text of elements that sit directly
// inside an <item> in $newsArray[<item number>][<element name>].
// Text anywhere else is ignored. The parser may deliver one text node in
// several pieces, so the pieces are appended.
function characterData($parser, $data)
{
    global $currentElements, $newsArray, $itemCount;
    $currentCount = count($currentElements);
    // A parent exists only when at least two elements are open.
    if ($currentCount < 2 || $currentElements[$currentCount-2] != "item") {
        return;
    }
    $thisElement = $currentElements[$currentCount-1];
    $itemIndex = $itemCount - 1;
    if (isset($newsArray[$itemIndex][$thisElement])) {
        $newsArray[$itemIndex][$thisElement] .= $data;
    } else {
        $newsArray[$itemIndex][$thisElement] = $data;
    }
}
// End-tag handler: removes the innermost open element from the stack, but
// only when its name matches the element that is being closed.
function endElement($parser, $name)
{
    global $currentElements;

    $lastIndex = count($currentElements) - 1;
    if ($currentElements[$lastIndex] != $name) {
        return;
    }
    array_pop($currentElements);
}
?>
use:
<?php
// Read the file in chunks of up to 4096 bytes and delete every "\n" from each
// chunk before it is used. The loop ends as soon as the stripped chunk is
// falsy (for example an empty string).
while ($data = str_replace("\n","",fread($fp, 4096))){
// ...
}
?>
instead of:
<?php
// Read the file in chunks of up to 4096 bytes, leaving each chunk unchanged.
while ($data = fread($fp, 4096)) {
// ...
}
?>
[UPDATE 17-FEB-2003: This post] resulted in some of the visitors e-mailing me with questions on the carriage return stripping issue. I'll try to make the following mumble as brief and easy to understand as possible.
1. Overview of the 4096 fragmentation issue
As you know the following freads the file 4096 bytes at a time (that is 4KB) this is perhaps ok for testing expat and figuring out how things work, but it is rather dangerous in the production environment. Data may not be fully understandable due to fread fragmentation and improperly formatted due to numerous sources(formats) of data contained within (i.e. end of line delimited CDATA).
<?php
// Feed the file to the parser in chunks of up to 4096 bytes. feof($fp) is
// passed as the third argument of xml_parse(), which marks the last chunk.
// The inner block runs when xml_parse() reports a failure for a chunk.
while ($data = fread($fp, 4096)) {
if (!xml_parse($xml_parser, $data, feof($fp))) {
// ...
}
}
?>
Sometimes, to save time, one may want to load it all into one big variable and leave all the worries to expat. I think anything under 500 KB is OK (as long as nobody knows about it). Some may argue that larger variables are acceptable or even necessary because of the magic that takes place while parsing with xml_parse. Our XML parser (expat) works and can be successfully implemented only when we know what type of XML data we are dealing with: its average size, the structure of its general layout, and the data contained within tags. For example, if the tags are followed by a line delimiter such as a newline, we can read the file with fgets and, with minimal effort, make sure that no data is sent to the function that does not end with an end tag. But this requires a fair knowledge of the file's conventions for storing XML data and tags (and a bit of code between reading the data and xml_parse'ing it).
It will save you a headache.
2. Pre Parser Strings and New Line Delimited Data
One important thing to note at this point is that the xml_parse function requires a string variable. You can manipulate the content of any string variable easily as we all know.
A better approach to removing newlines than:
<?php
while ($data = fread($fp, 4096)) {
$data = preg_replace("/\n|\r/","",$data); //flarp
if (!xml_parse($xml_parser, $data, feof($fp))) {...
?>
The above works across all three kinds of line-delimited text files (\n, \r, \r\n). But it could potentially (or will most likely) damage or scramble data contained in, for example, CDATA areas. As far as I am concerned, end-of-line characters should not be used _within_ XML tags. What seems to be the ultimate solution is to pre-parse the loaded data. This would require checking the position within the XML document and adding or subtracting data (using a temporary variable in between fread calls) based on conditions like "Is within tag", "Is within CDATA", etc., before feeding it to the parser. This of course opens up a new can of worms (as in: parse data for the parser...). (The above procedure would take place between the fread and xml_parse calls; this method would be compatible with the general usage examples at the top of the page.)
3. The Answer to parsing arbitrary XML and Preprocessor Revisited
You can't just feed any XML document to the parser you constructed and assume that it will work! You have to know what methods for storing data are used: for example, is there end-of-line-delimited data in the file? Are there any carriage returns in the tags? And so on. XML files come formatted in different ways: some are just one long string of characters without any end-of-line markers; others have newlines, carriage returns, or both (Microsloth Windows). They may or may not contain spaces and other whitespace between tags. For this reason it is important to do what I call normalizing the data before feeding it to the parser. You can perform this with regular expressions or plain old str_replace and concatenation. In many cases this can be done to the file itself, sometimes to string data on the fly (as shown in the example above). But I feel it is important to normalize the data before even calling the function that calls xml_parse. If you have the ability to access all the data before that call, you can convert it to what you feel the data should have been in the first place and avoid many surprises and expensive regular-expression substitutions (in a tight spot) while fread'ing the data.
For a simple XML parser you can use this function. It doesn't require any extensions to run.
<?php
// Extracts the content between the first occurrence of $start and the first
// occurrence of $end that follows it.
//
// $xml    string to search
// $start  opening tag, e.g. "<Code>"
// $end    closing tag, e.g. "</Code>"
//
// Returns the text between the two tags (possibly an empty string), or false
// when either tag cannot be found.
//
// Side effect: on success the global $pos is set to the offset in $xml just
// past the closing tag, so a caller can continue scanning from there. $pos is
// left untouched when false is returned.
function GetElementByName ($xml, $start, $end) {
    global $pos;

    $startpos = strpos($xml, $start);
    if ($startpos === false) {
        return false;
    }

    // Look for the closing tag only after the opening tag; searching from
    // offset 0 could match a closing tag that precedes it.
    $contentStart = $startpos + strlen($start);
    $endpos = strpos($xml, $end, $contentStart);
    if ($endpos === false) {
        // Unterminated element: report "not found" instead of treating the
        // false result as offset 0.
        return false;
    }

    $pos = $endpos + strlen($end);
    return substr($xml, $contentStart, $endpos - $contentStart);
}
// Open and read xml file. You can replace this with your xml data.
$file = "data.xml";
$pos = 0;
$Nodes = array();
if (!($fp = fopen($file, "r"))) {
    die("could not open XML input");
}
// Start from an empty string so the first concatenation does not read an
// undefined variable.
$data = "";
while (!feof($fp)) {
    $getline = fread($fp, 4096);
    if ($getline === false) {
        // Read error: stop and work with what has been read so far.
        break;
    }
    $data .= $getline;
}
fclose($fp);
$count = 0;
$pos = 0;
// Goes through the XML data and creates an array of all <XML_TAG> tags.
// Compare against false explicitly: an element whose content is "" or "0"
// is a valid result and must not end the scan.
while (($node = GetElementByName($data, "<XML_TAG>", "</XML_TAG>")) !== false) {
    $Nodes[$count] = $node;
    $count++;
    // GetElementByName stores in $pos the offset reached by its search;
    // drop the part of $data before it and continue with the remainder.
    $data = substr($data, $pos);
}
// Gets information from tag siblings.
for ($i = 0; $i < $count; $i++) {
    $code = GetElementByName($Nodes[$i], "<Code>", "</Code>");
    $desc = GetElementByName($Nodes[$i], "<Description>", "</Description>");
    $price = GetElementByName($Nodes[$i], "<BasePrice>", "</BasePrice>");
}
?>
Hope this helps! :)
Guy Laor
Some reference code I am working on as an "XML Library", which I am folding into an object. Notice the use of the DEFINE:
Mainly Example 1 and parts of 2 & 3 re-written as an object:
--- MyXMLWalk.lib.php ---
<?php
if (!defined("PHPXMLWalk")) {
    // Guard constant: makes including this library more than once harmless.
    define("PHPXMLWalk", TRUE);

    /**
     * Prints an indented outline of an XML document while it is being parsed.
     *
     * Every start tag is printed at the current indentation (followed by a
     * print_r dump of its attributes, if any), character data is printed
     * HTML-escaped at the indentation of its enclosing element's children,
     * and the indentation grows by two spaces per nesting level.
     *
     * Usage: create an instance, then call parse() for every chunk read from
     * the input stream.
     */
    class XMLWalk {
        /** The XML parser handle (a resource before PHP 8, an XMLParser object since). */
        public $p;
        /** Current indentation in spaces, keyed by an integer id of the parser. */
        public $e;

        /**
         * Creates the parser and registers this object's handlers on it.
         *
         * A real __construct() is required: a method named after the class is
         * no longer treated as a constructor by PHP 8.
         */
        public function __construct() {
            $this->p = xml_parser_create();
            $this->e = array();
            xml_parser_set_option($this->p, XML_OPTION_CASE_FOLDING, true);
            xml_set_element_handler($this->p, array($this, "startElement"), array($this, "endElement"));
            xml_set_character_data_handler($this->p, array($this, "dataElement"));
            register_shutdown_function(array($this, "free")); // make a destructor
        }

        /**
         * Returns the print_r() rendering of $x with every line after the
         * first indented by $i spaces.
         *
         * @param mixed $x value to render
         * @param int   $i number of spaces inserted after each newline
         * @return string
         */
        public function prl($x, $i = 0) {
            $buf = print_r($x, true);
            // explode() replaces split(), which was removed in PHP 7.
            return join("\n" . str_repeat(" ", $i), explode("\n", $buf));
        }

        /**
         * Maps a parser handle to an integer usable as an array key.
         * Objects (PHP 8 XMLParser) cannot be array keys, and resources are
         * only cast implicitly with a notice, so convert explicitly.
         */
        private function parserKey($parser) {
            return is_object($parser) ? spl_object_id($parser) : (int) $parser;
        }

        /** Returns the current indentation for $parser (0 before the first element). */
        private function depth($parser) {
            $key = $this->parserKey($parser);
            return isset($this->e[$key]) ? $this->e[$key] : 0;
        }

        /** Start-tag handler: prints the element name and attributes, then indents. */
        public function startElement($parser, $name, $attrs) {
            $depth = $this->depth($parser);
            if (count($attrs) >= 1) {
                $x = $this->prl($attrs, $depth + 6);
            } else {
                $x = "";
            }
            print str_repeat(" ", $depth) . "$name $x\n";
            $this->e[$this->parserKey($parser)] = $depth + 2;
        }

        /** Character-data handler: prints the HTML-escaped text at the current indentation. */
        public function dataElement($parser, $data) {
            print str_repeat(" ", $this->depth($parser)) . htmlspecialchars($data, ENT_QUOTES) . "\n";
        }

        /** End-tag handler: undoes the indentation added by startElement(). */
        public function endElement($parser, $name) {
            $this->e[$this->parserKey($parser)] = $this->depth($parser) - 2;
        }

        /**
         * Feeds one chunk of input to the parser and terminates the script
         * with a message if the XML is not well-formed.
         *
         * @param string   $data chunk of XML text
         * @param resource $fp   stream the chunk was read from; the chunk is
         *                       treated as the final one when the stream is at EOF
         */
        public function parse($data, $fp) {
            if (!xml_parse($this->p, $data, feof($fp))) {
                die(sprintf("XML error: %s at line %d",
                    xml_error_string(xml_get_error_code($this->p)),
                    xml_get_current_line_number($this->p)));
            }
        }

        /**
         * Releases the parser. Safe to call more than once (it is also run
         * as a shutdown function).
         */
        public function free() {
            // Only resource handles (before PHP 8) need an explicit free;
            // parser objects are released when the last reference goes away.
            if (is_resource($this->p)) {
                xml_parser_free($this->p);
            }
            $this->p = null;
        }
    } // end of class
} // end of define
?>
--- end of file ---
Calling code:
<?php
...
require("MyXMLWalk.lib.php");
$file = "x.xml";
$xme = new XMLWalk;
if (!($fp = fopen($file, "r"))) {
die("could not open XML input");
}
while ($data = fread($fp, 4096)) {
$xme->parse($data, $fp);
}
...
?>
[Editor's note: see also xml_parse_into_struct().]
Very simple routine to convert an XML file into a PHP structure. $obj->xml contains the resulting PHP structure. I would be interested if someone could suggest a cleaner method than the evals I am using.
<?php
$filename = 'sample.xml';

// $obj carries the state shared by the handlers below:
//   $obj->xml    root container of the result. Each element becomes a
//                property named after its tag on its parent node; a tag that
//                repeats under the same parent becomes a list of nodes.
//                A node holds its attributes in ->attr and its text in ->data.
//   $obj->stack  nodes of the elements that are currently open, innermost last.
// The structure is built with plain property access; no eval() is involved,
// so element names and text from the document are never executed as code.
if (!isset($obj)) {
    $obj = new stdClass();
}
$obj->xml = new stdClass();
$obj->stack = array();

// Start-tag handler: creates a node for the element under the currently open
// node (or under the root container) and makes it the current node.
function startElement($parser, $name, $attrs) {
    global $obj;
    $parent = count($obj->stack) ? $obj->stack[count($obj->stack) - 1] : $obj->xml;
    $node = new stdClass();
    if (isset($parent->$name)) {
        // If var already defined, make array
        if (!is_array($parent->$name)) {
            $parent->$name = array($parent->$name);
        }
        $siblings = $parent->$name;
        $siblings[] = $node;
        $parent->$name = $siblings;
    }
    else {
        $parent->$name = $node;
    }
    if (count($attrs)) {
        $node->attr = $attrs;
    }
    $obj->stack[] = $node;
}

// End-tag handler: the element is complete, so its parent becomes current again.
function endElement($parser, $name) {
    global $obj;
    array_pop($obj->stack);
}

// Character-data handler: stores text on the current node. The parser may
// report the text of one element in several calls, so the pieces are
// appended rather than overwriting each other.
function characterData($parser, $data) {
    global $obj;
    if (!count($obj->stack)) {
        return;
    }
    $node = $obj->stack[count($obj->stack) - 1];
    $node->data = isset($node->data) ? $node->data . $data : $data;
}

$xml_parser = xml_parser_create();
xml_set_element_handler($xml_parser, "startElement", "endElement");
xml_set_character_data_handler($xml_parser, "characterData");
if (!($fp = fopen($filename, "r"))) {
    die("could not open XML input");
}
// Loop until EOF has actually been observed so that the last xml_parse()
// call is always made with the "final" flag set, even when the file size is
// an exact multiple of the chunk size.
while (!feof($fp)) {
    $data = fread($fp, 4096);
    if ($data === false) {
        break;
    }
    if (!xml_parse($xml_parser, $data, feof($fp))) {
        die(sprintf("XML error: %s at line %d",
            xml_error_string(xml_get_error_code($xml_parser)),
            xml_get_current_line_number($xml_parser)));
    }
}
fclose($fp);
xml_parser_free($xml_parser);
print_r($obj->xml);
return 0;
?>
I had to TRIM the data when I passed one large string containing a well-formed XML file to xml_parse. The string was read by cURL, which apparently put a BLANK at the end of the string. This BLANK produced an "XML not wellformed" error in xml_parse!
I've discovered some unusual behaviour in this API when ampersand entities are parsed in cdata; for some reason the parser breaks up the section around the entities, and calls the handler repeated times for each of the sections. If you don't allow for this oddity and you are trying to put the cdata into a variable, only the last part will be stored.
You can get around this with a line like:
$foo .= $cdata;
If the handler is called several times from the same tag, it will append them, rather than rewriting the variable each time. If the entire cdata section is returned, it doesn't matter.
May happen for other entities, but I haven't investigated.
Took me a while to figure out what was happening; hope this saves someone else the trouble.
When using the XML parser, make sure you're not using the magic quotes option (e.g. use set_magic_quotes_runtime(0) if it's not the compiled default), otherwise you'll get 'not well-formed' errors when dealing with tags with attributes set in them.
