Root configuration processor
Core v2.2.0
The <config> processor is the root element of every WebHarvest configuration.
It defines global settings like charset and scripting language, and contains all
other processors. Every WebHarvest scraper must start with this element.
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Minimal pipeline: define a base URL, request an endpoint beneath it, and
  pass the response through json-to-xml.
-->
<config charset="UTF-8"
        xmlns="http://org.webharvest/schema/2.1/core">

    <!-- Base address referenced by the request below. -->
    <def var="baseUrl">https://example.com</def>

    <!-- Holds the result of the HTTP request to ${baseUrl}/api/data. -->
    <def var="response">
        <http url="${baseUrl}/api/data"/>
    </def>

    <!-- Holds the response after it has gone through json-to-xml. -->
    <def var="processedData">
        <json-to-xml>${response}</json-to-xml>
    </def>

</config>
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Authenticated request example. scriptlang="groovy" selects Groovy as the
  language used for the ${...} expressions in this configuration.
-->
<config xmlns="http://org.webharvest/schema/2.1/core"
        charset="UTF-8"
        scriptlang="groovy">

    <!--
      The API key is read from the API_KEY environment variable instead of
      being committed to the configuration file as a literal credential.
      System.getenv returns null when the variable is not set, so export
      API_KEY in the environment of the process that runs this scraper.
    -->
    <def var="apiKey">${System.getenv("API_KEY")}</def>

    <!-- Current time in epoch milliseconds (java.util.Date.getTime). -->
    <def var="timestamp">${new Date().getTime()}</def>

    <!-- Both variables above are sent as request headers. -->
    <http url="https://api.example.com/data">
        <http-header name="Authorization">Bearer ${apiKey}</http-header>
        <http-header name="X-Timestamp">${timestamp}</http-header>
    </http>

</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core" charset="UTF-8">
<!--
  Example flow: fetch one category page, collect the product link hrefs
  from it, then fetch every linked page and append its title and price to
  products.xml.
-->
<!-- Global variables: site root and the category segment used in the URL -->
<def var="baseUrl">https://shop.example.com</def>
<def var="category">electronics</def>
<!-- Main scraping logic: category page fetched over HTTP and passed through html-to-xml -->
<def var="categoryPage">
<html-to-xml>
<http url="${baseUrl}/category/${category}"/>
</html-to-xml>
</def>
<!-- href attribute values of every a element whose class is exactly 'product-link' -->
<def var="productLinks">
<xpath expression="//a[@class='product-link']/@href">
<get var="categoryPage"/>
</xpath>
</def>
<!--
  Process each product; the current link is exposed as productUrl.
  NOTE(review): the upstream Web-Harvest manual documents loop with list
  and body child elements rather than a list attribute. Confirm that the
  attribute form is accepted by the targeted version.
  NOTE(review): ${baseUrl}${productUrl} is plain concatenation, so this
  assumes every href is site-relative and starts with '/'. Verify against
  the real markup.
-->
<loop item="productUrl" list="${productLinks}">
<!-- Product page fetched over HTTP and passed through html-to-xml -->
<def var="productData">
<html-to-xml>
<http url="${baseUrl}${productUrl}"/>
</html-to-xml>
</def>
<!-- Text of h1 elements with class 'product-title' -->
<def var="productName">
<xpath expression="//h1[@class='product-title']/text()">
<get var="productData"/>
</xpath>
</def>
<!-- Text of span elements with class 'price' -->
<def var="productPrice">
<xpath expression="//span[@class='price']/text()">
<get var="productData"/>
</xpath>
</def>
<!--
  Save product data: one product record is appended to products.xml per
  iteration.
  NOTE(review): the extracted values are interpolated as-is. Presumably
  names containing '&' or '<' would need escaping for the output to stay
  well-formed. Confirm how the engine handles this.
-->
<file path="products.xml" action="append">
<product>
<name>${productName}</name>
<price>${productPrice}</price>
<url>${baseUrl}${productUrl}</url>
</product>
</file>
</loop>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core" charset="UTF-8">
<!--
  Error-handling example: call an API and convert the response; on failure,
  log the error and load previously saved data instead.
  NOTE(review): no scriptlang attribute is set, so the ${new Date()} and
  ${exception.message} expressions below rely on the engine's default
  script language. Confirm they evaluate as intended.
-->
<def var="apiEndpoint">https://api.example.com/data</def>
<!-- Happy path: request, json-to-xml conversion, then a success log entry -->
<try>
<!-- NOTE(review): timeout="30000" is presumably milliseconds; confirm the unit -->
<def var="response">
<http url="${apiEndpoint}" timeout="30000"/>
</def>
<def var="data">
<json-to-xml>${response}</json-to-xml>
</def>
<file path="success.log" action="append">
Successfully processed API response at ${new Date()}
</file>
</try>
<!--
  Failure path.
  NOTE(review): catch is written here as a sibling of try. The upstream
  Web-Harvest manual documents catch (together with body) as a child of
  try. Confirm which form the targeted version accepts.
  NOTE(review): the variable named exception is assumed to be provided by
  the engine inside catch. Verify the actual variable name.
-->
<catch>
<file path="error.log" action="append">
Error processing API: ${exception.message} at ${new Date()}
</file>
<!--
  Fallback to cached data: data is redefined from a file on disk.
  Nothing in this configuration writes cache/last-response.xml, so the
  file must be produced elsewhere.
-->
<def var="data">
<file path="cache/last-response.xml" action="read"/>
</def>
</catch>
</config>