<config>

Root configuration processor

Core v2.2.0

Overview

The <config> processor is the root element of every WebHarvest configuration. It defines global settings, such as the charset and the scripting language, and contains all other processors. Every WebHarvest scraper must start with this element.

Usage Examples

Example 1: Basic configuration

example-1.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Minimal configuration: define a base URL, request ${baseUrl}/api/data
     over HTTP, and pass the response through json-to-xml. -->
<config xmlns="http://org.webharvest/schema/2.1/core" charset="UTF-8">
  <!-- Literal base URL reused by the request below. -->
  <def var="baseUrl">https://example.com</def>
  <!-- "response" is defined from the nested http call. -->
  <def var="response">
    <http url="${baseUrl}/api/data"/>
  </def>
  <!-- "processedData" is defined from json-to-xml applied to ${response}. -->
  <def var="processedData">
    <json-to-xml>${response}</json-to-xml>
  </def>
</config>

Example 2: With custom scripting language

example-2.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Sets scriptlang="groovy" on the root element; the ${...} expressions
     below (for example new Date().getTime()) are presumably evaluated as
     Groovy - confirm against the engine documentation. -->
<config xmlns="http://org.webharvest/schema/2.1/core"
        charset="UTF-8"
        scriptlang="groovy">
  <!-- Placeholder only: substitute your own key and never commit a real
       credential to a configuration file. -->
  <def var="apiKey">YOUR_API_KEY</def>
  <!-- Current time in epoch milliseconds, computed by the script expression. -->
  <def var="timestamp">${new Date().getTime()}</def>

  <!-- Request that sends the key and the timestamp as HTTP headers. -->
  <http url="https://api.example.com/data">
    <http-header name="Authorization">Bearer ${apiKey}</http-header>
    <http-header name="X-Timestamp">${timestamp}</http-header>
  </http>
</config>

Example 3: Complex scraping workflow

example-3.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Scraping workflow: load one category page, extract the product links
     from it, then visit each product page and append its name, price and
     URL to products.xml. -->
<config xmlns="http://org.webharvest/schema/2.1/core" charset="UTF-8">
  <!-- Global variables -->
  <def var="baseUrl">https://shop.example.com</def>
  <def var="category">electronics</def>
  
  <!-- Main scraping logic -->
  <!-- The category page is fetched over HTTP and passed through html-to-xml
       so that the xpath processor below can query it. -->
  <def var="categoryPage">
    <html-to-xml>
      <http url="${baseUrl}/category/${category}"/>
    </html-to-xml>
  </def>
  
  <!-- Selects the href attribute of every a element whose class attribute is
       exactly 'product-link'; anchors carrying additional class names are
       not matched by this equality test. -->
  <def var="productLinks">
    <xpath expression="//a[@class='product-link']/@href">
      <get var="categoryPage"/>
    </xpath>
  </def>
  
  <!-- Process each product -->
  <!-- NOTE(review): the classic Web-Harvest manual documents loop with
       <list> and <body> child elements; confirm that the list attribute
       form used here is accepted by this version. -->
  <loop item="productUrl" list="${productLinks}">
    <!-- The request URL is built by concatenating baseUrl and productUrl;
         this assumes the extracted hrefs are site-relative paths that begin
         with a slash. TODO confirm against the target markup. -->
    <def var="productData">
      <html-to-xml>
        <http url="${baseUrl}${productUrl}"/>
      </html-to-xml>
    </def>
    
    <def var="productName">
      <xpath expression="//h1[@class='product-title']/text()">
        <get var="productData"/>
      </xpath>
    </def>
    
    <def var="productPrice">
      <xpath expression="//span[@class='price']/text()">
        <get var="productData"/>
      </xpath>
    </def>
    
    <!-- Save product data -->
    <!-- NOTE(review): every iteration appends one <product> fragment, and
         nothing in this configuration writes an enclosing root element, so
         products.xml is presumably a sequence of fragments rather than a
         well-formed document. Confirm that consumers expect this. -->
    <file path="products.xml" action="append">
      <product>
        <name>${productName}</name>
        <price>${productPrice}</price>
        <url>${baseUrl}${productUrl}</url>
      </product>
    </file>
  </loop>
</config>

Example 4: Error handling configuration

example-4.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Error handling: request the API endpoint, convert the response with
     json-to-xml and log success; on failure, log the error and define
     "data" from a cached file instead. -->
<config xmlns="http://org.webharvest/schema/2.1/core" charset="UTF-8">
  <def var="apiEndpoint">https://api.example.com/data</def>
  
  <!-- NOTE(review): the classic Web-Harvest manual documents try as wrapping
       both a <body> and a <catch> child; here <catch> is written as a sibling
       that follows </try>. Confirm that this form is accepted by this version. -->
  <!-- NOTE(review): timeout="30000" is presumably in milliseconds; the unit
       is not stated here. TODO confirm. -->
  <!-- NOTE(review): no scriptlang attribute is set on this config, so the
       ${new Date()} expressions rely on the default scripting language.
       Confirm that it supports this expression. -->
  <!-- The text inside each <file> element spans indented lines, so the
       surrounding line breaks and indentation are part of the element's text
       content; whether they end up in the log file depends on the processor. -->
  <try>
    <def var="response">
      <http url="${apiEndpoint}" timeout="30000"/>
    </def>
    
    <def var="data">
      <json-to-xml>${response}</json-to-xml>
    </def>
    
    <file path="success.log" action="append">
      Successfully processed API response at ${new Date()}
    </file>
  </try>
  <catch>
    <!-- NOTE(review): assumes the engine exposes an "exception" variable
         inside catch. Verify against the processor reference. -->
    <file path="error.log" action="append">
      Error processing API: ${exception.message} at ${new Date()}
    </file>
    
    <!-- Fallback to cached data -->
    <def var="data">
      <file path="cache/last-response.xml" action="read"/>
    </def>
  </catch>
</config>

Important Notes

Related Processors