HTML to well-formed XML converter
Core v2.2.0
The <html-to-xml> processor is one of the most critical processors in WebHarvest.
It converts potentially malformed HTML from web pages into well-formed XML that can be
processed using XPath and XQuery. This is necessary because most web pages contain
invalid HTML (unclosed tags, missing quotes, etc.) that XML parsers reject.
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
    <!-- Inline pipeline: the page fetched by <http> is cleaned by
         <html-to-xml>, and the XPath result is stored in productName. -->
    <def var="productName">
        <xpath expression="//h1[@class='product-title']/text()">
            <html-to-xml>
                <http url="https://example.com/product/123"/>
            </html-to-xml>
        </xpath>
    </def>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
    <!-- Step 1: download the page a single time and keep the raw HTML -->
    <def var="htmlPage">
        <http url="https://example.com/article/456"/>
    </def>

    <!-- Step 2: clean the stored HTML into XML a single time -->
    <def var="xmlPage">
        <html-to-xml>${htmlPage}</html-to-xml>
    </def>

    <!-- Step 3: run several XPath queries against the same xmlPage value -->
    <def var="title">
        <xpath expression="//h1/text()">
            ${xmlPage}
        </xpath>
    </def>
    <def var="author">
        <xpath expression="//meta[@name='author']/@content">
            ${xmlPage}
        </xpath>
    </def>
    <def var="content">
        <xpath expression="//div[@class='content']">
            ${xmlPage}
        </xpath>
    </def>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
    <!-- Collect the href attribute of every anchor on the cleaned page -->
    <def var="links">
        <xpath expression="//a/@href">
            <html-to-xml>
                <http url="https://example.com/links"/>
            </html-to-xml>
        </xpath>
    </def>

    <!-- Visit each collected link: the current value is exposed as ${link}.
         NOTE(review): classic Web-Harvest 2.x configs wrap the loop input in
         <list> and the per-item work in <body>; confirm that this shorthand
         form is accepted by the release being documented. -->
    <loop item="link">
        ${links}
        <http url="${link}"/>
    </loop>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
    <!-- Select the row elements of the table whose id is "products".
         NOTE(review): HTML cleaners may insert a <tbody> wrapper, in which
         case the direct-child step /tr matches nothing; verify against the
         cleaned output of the real page. -->
    <def var="rows">
        <xpath expression="//table[@id='products']/tr">
            <html-to-xml>
                <http url="https://example.com/products"/>
            </html-to-xml>
        </xpath>
    </def>

    <!-- For every row, read the first and second cell with XPath expressions
         that are relative to the row itself. -->
    <loop item="row">
        ${rows}
        <def var="productName">
            <xpath expression="td[1]/text()">${row}</xpath>
        </def>
        <def var="productPrice">
            <xpath expression="td[2]/text()">${row}</xpath>
        </def>
        <!-- Save to file/database -->
    </loop>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
    <!-- Headline text: every <h2> nested anywhere inside an <article> whose
         class attribute contains the substring "news-item". -->
    <def var="articles">
        <xpath expression="//article[contains(@class, 'news-item')]//h2/text()">
            <html-to-xml>
                <http url="https://news.example.com"/>
            </html-to-xml>
        </xpath>
    </def>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
    <!-- Even badly broken HTML works.
         The sample markup is written with &lt; and &gt; because a literal
         "less than" character is not allowed inside an XML attribute value;
         after parsing, the variable holds the raw string
         <p>Unclosed paragraph<div>No closing div<b>Bold -->
    <def var="brokenHtml"
         value="&lt;p&gt;Unclosed paragraph&lt;div&gt;No closing div&lt;b&gt;Bold"/>

    <html-to-xml>${brokenHtml}</html-to-xml>
    <!-- Result: Well-formed XML with auto-closed tags -->
</config>
,
,
, or literal text