Example Configurations
Real-world examples demonstrating Web-Harvest capabilities and best practices
Available Example Files
Download and run these real-world example configurations:
Simple Test
Basic configuration to test Web-Harvest functionality
E-commerce Monitoring
Monitor product prices across e-commerce sites
Social Media Analytics
Extract and analyze social media data
Data Processing Pipeline
Complete data processing workflow
Web Crawler
Intelligent web crawler with link following
API Integration
REST API integration and data processing
Modern Web Scraping
Advanced web scraping techniques
Product Catalog
Extract product information from catalogs
XQuery Examples
Advanced XQuery data processing
Function Examples
Built-in function usage examples
E-commerce Price Monitoring
Monitor product prices across multiple e-commerce sites and get an alert when a price drops below a configured threshold.
Features:
- Multi-site price monitoring
- Price threshold alerts
- Data persistence
- Error handling
<config xmlns="http://org.webharvest/schema/2.1/core">
  <!--
    Price monitor: downloads a single product page, selects the price element
    and writes price_alert.txt when the price is under the threshold.
  -->

  <!-- Inputs: the page to watch and the alert threshold -->
  <def var="productUrl">https://shop.example.com/product/123</def>
  <def var="priceThreshold">100.00</def>

  <!-- Download the page with a custom User-Agent and convert the HTML to XML -->
  <def var="productData">
    <html-to-xml advancedxmlescape="true" specialentities="true">
      <http url="${productUrl}" timeout="30000">
        <http-header name="User-Agent">Web-Harvest Bot</http-header>
      </http>
    </html-to-xml>
  </def>

  <!-- Select the element carrying class="price" from the converted page -->
  <def var="currentPrice">
    <xpath expression="//span[@class='price']">
      <get var="productData"/>
    </xpath>
  </def>

  <!--
    Raise the alert only when the current price is below the threshold.
    The less-than operator is written as &lt; because a literal '<' is not
    allowed inside an XML attribute value and would stop the file from parsing.
    NOTE(review): the comparison presumably needs numeric operands, while the
    XPath above selects the whole span element. Confirm the expression
    evaluates as intended for real pages.
  -->
  <if condition="${currentPrice &lt; priceThreshold}">
    <then>
      <file path="price_alert.txt" action="write">
        <!-- Template body is kept flush-left: whitespace inside it is part of the written text -->
        <template>
<![CDATA[
PRICE ALERT!
Product: ${productUrl}
Current Price: $${currentPrice}
Threshold: $${priceThreshold}
Date: ${sys.currentTime()}
]]>
</template>
      </file>
    </then>
  </if>
</config>
Social Media Analytics
Extract and analyze social media posts for sentiment analysis and engagement metrics.
Features:
- API integration
- JSON to XML conversion
- XQuery data processing
- Sentiment analysis
<config xmlns="http://org.webharvest/schema/2.1/core">
  <!--
    Sentiment report: fetches posts for one hashtag from a JSON API, converts
    the response to XML, counts posts per sentiment with XQuery and writes the
    summary to sentiment_report.xml.
  -->

  <def var="hashtag">#technology</def>

  <!--
    Get data from the social media API.
    The hashtag is passed as an http-param instead of being pasted into the
    URL: a raw '#' inside a URL starts the fragment part, so the value would
    not be sent to the server as the query parameter.
    NOTE(review): confirm http-param values are URL-encoded for GET requests
    in the Web-Harvest version in use.
  -->
  <def var="socialData">
    <http url="https://api.social.com/posts">
      <http-param name="hashtag"><get var="hashtag"/></http-param>
      <http-header name="Authorization">Bearer YOUR_API_KEY</http-header>
    </http>
  </def>

  <!-- Convert the JSON response to XML so it can be queried -->
  <def var="postsXml">
    <json-to-xml>
      <get var="socialData"/>
    </json-to-xml>
  </def>

  <!--
    Count posts per sentiment value and derive rounded percentages.
    The $total > 0 guards avoid dividing by zero when no posts are returned.
    NOTE(review): assumes each post element has a sentiment child with the
    value positive, negative or neutral. Confirm against the API response.
  -->
  <def var="sentimentAnalysis">
    <xquery>
      <xq-param name="posts">
        <get var="postsXml"/>
      </xq-param>
      <xq-expression>
        <![CDATA[
          declare variable $posts as node() external;
          let $positive := count($posts//post[sentiment = "positive"])
          let $negative := count($posts//post[sentiment = "negative"])
          let $neutral := count($posts//post[sentiment = "neutral"])
          let $total := count($posts//post)
          return
            <sentiment_analysis>
              <total_posts>{$total}</total_posts>
              <positive_posts>{$positive}</positive_posts>
              <negative_posts>{$negative}</negative_posts>
              <neutral_posts>{$neutral}</neutral_posts>
              <positive_percentage>{if ($total > 0) then round($positive div $total * 100) else 0}</positive_percentage>
              <negative_percentage>{if ($total > 0) then round($negative div $total * 100) else 0}</negative_percentage>
              <neutral_percentage>{if ($total > 0) then round($neutral div $total * 100) else 0}</neutral_percentage>
            </sentiment_analysis>
        ]]>
      </xq-expression>
    </xquery>
  </def>

  <!-- Save the summary -->
  <file path="sentiment_report.xml" action="write">
    <get var="sentimentAnalysis"/>
  </file>
</config>
Data Processing Pipeline
Complete data processing pipeline with validation, transformation, and reporting.
Features:
- CSV data processing
- Data validation
- Error handling
- Report generation
<config xmlns="http://org.webharvest/schema/2.1/core">
  <!--
    CSV pipeline: reads input.csv, turns the accepted tokens into item
    elements and writes them, wrapped in a report, to processed_data.xml.
  -->

  <!-- Read the input file as UTF-8 text -->
  <def var="rawData">
    <file path="input.csv" action="read" charset="UTF-8"/>
  </def>

  <!--
    Build one item per token.
    NOTE(review): delimiters="," is applied to the whole file content, so each
    "row" looks like a single comma-separated token, not a CSV line. Confirm
    whether the input should be split on line breaks first.
  -->
  <def var="processedData">
    <loop item="row">
      <tokenize delimiters="," trimtokens="true">
        <get var="rawData"/>
      </tokenize>
      <body>
        <!-- Keep the row only when row.length() is at least 3; otherwise null -->
        <def var="validatedRow">
          <script>
            // Data validation logic
            // NOTE(review): row.length() looks like a character count, not a column count. Confirm.
            if (row.length() >= 3) {
              return row;
            } else {
              return null;
            }
          </script>
        </def>

        <!--
          Only rows that passed validation produce an item.
          NOTE(review): row[0], row[1] and row[2] assume row is indexable.
          Confirm against the type the loop actually supplies.
        -->
        <if condition="${validatedRow != null}">
          <then>
            <def var="processedRow">
              <!-- Template body is kept flush-left: whitespace inside it is part of the produced text -->
              <template>
<![CDATA[
<item>
<id>${row[0]}</id>
<name>${row[1]}</name>
<price>${row[2]}</price>
<processed_at>${sys.currentTime()}</processed_at>
</item>
]]>
</template>
            </def>
          </then>
        </if>
      </body>
    </loop>
  </def>

  <!--
    Write the report.
    The collected items are inserted with ${processedData}: markup inside a
    CDATA section is plain text, so a get element placed there would be written
    out literally instead of being evaluated.
    NOTE(review): sys.recordCount() is not used anywhere else in these
    examples. Confirm that it exists.
  -->
  <file path="processed_data.xml" action="write">
    <template>
<![CDATA[
<report>
<timestamp>${sys.currentTime()}</timestamp>
<total_records>${sys.recordCount()}</total_records>
<data>
${processedData}
</data>
</report>
]]>
</template>
  </file>
</config>
Web Crawler
Intelligent web crawler that follows links and extracts content from multiple pages.
Features:
- Link following
- Content extraction
- Duplicate prevention
- Error handling
<config xmlns="http://org.webharvest/schema/2.1/core">
  <!--
    Crawler skeleton: starts from the home URL, announces the page being
    visited and checks whether a link belongs to the site and ends in .php.
  -->

  <!-- Start page; also the base for resolving links and the allowed URL prefix -->
  <def var="home">http://web-harvest.sourceforge.net/</def>

  <!-- Work list, seeded with the start page -->
  <def var="unvisited">
    <list>
      <template>${home}</template>
    </list>
  </def>

  <!-- Bookkeeping lists, both initially empty -->
  <def var="visited">
    <list/>
  </def>
  <def var="newLinks">
    <list/>
  </def>

  <!-- Repeat while the work list renders to a non-empty string -->
  <while condition="${unvisited.toString().length() > 0}">
    <def var="currUrl">
      <get var="unvisited"/>
    </def>

    <!-- Log the page being processed -->
    <def var="visitedUpdate">
      <script><![CDATA[
        print("Visiting: " + currUrl);
      ]]></script>
    </def>

    <!--
      The script is wrapped in CDATA because it contains '&&'; a raw '&' in
      element text is not well-formed XML and would stop the file from parsing.
      NOTE(review): currLink is not defined anywhere in this config. Confirm
      where the links of the current page are meant to come from.
    -->
    <def var="linkCheck">
      <script><![CDATA[
        // Accept only .php pages under the Web-Harvest site (case-insensitive).
        boolean isValidUrl(String url) {
          String urlSmall = url.toLowerCase();
          return urlSmall.startsWith("http://web-harvest.sourceforge.net/")
              && urlSmall.endsWith(".php");
        }
        String fullLink = sys.fullUrl(home, currLink);
        if (isValidUrl(fullLink.toString())) {
          print("Found valid link: " + fullLink);
        }
      ]]></script>
    </def>

    <!--
      Reset the work list to empty.
      NOTE(review): nothing adds to unvisited, visited or newLinks, so the
      loop presumably stops after the first pass. Confirm that this is the
      intended simplification.
    -->
    <def var="unvisited">
      <list/>
    </def>
  </while>
</config>
API Integration
Integrate with REST APIs to fetch, process, and store data from external services.
Features:
- REST API calls
- Authentication
- Data transformation
- Database storage
<config xmlns="http://org.webharvest/schema/2.1/core">
  <!--
    REST import: requests the product list as JSON, converts it to XML and
    inserts the id and name of every product element into a database table.
  -->

  <!-- Connection settings for the API -->
  <def var="apiKey">YOUR_API_KEY</def>
  <def var="baseUrl">https://api.example.com/v1</def>

  <!-- GET {baseUrl}/products with bearer authentication, asking for JSON -->
  <def var="apiData">
    <http url="${baseUrl}/products" method="GET">
      <http-header name="Authorization">Bearer ${apiKey}</http-header>
      <http-header name="Accept">application/json</http-header>
    </http>
  </def>

  <!-- Convert the JSON response to XML so XPath can be applied -->
  <def var="xmlData">
    <json-to-xml>
      <get var="apiData"/>
    </json-to-xml>
  </def>

  <!-- One iteration per product element found in the converted response -->
  <loop item="product">
    <xpath expression="//product">
      <get var="xmlData"/>
    </xpath>
    <body>
      <!-- Pick the id and name nodes of the current product -->
      <def var="productId">
        <xpath expression=".//id">
          <get var="product"/>
        </xpath>
      </def>
      <def var="productName">
        <xpath expression=".//name">
          <get var="product"/>
        </xpath>
      </def>

      <!--
        Insert the pair through a parameterised statement; the db-param
        children supply the two placeholders in order.
        NOTE(review): username and password are placeholder literals. Replace
        them with real credentials from a safe source before use.
      -->
      <database driver="com.mysql.cj.jdbc.Driver"
                url="jdbc:mysql://localhost:3306/products"
                username="user"
                password="pass"
                query="INSERT INTO products (id, name) VALUES (?, ?)">
        <db-param>${productId}</db-param>
        <db-param>${productName}</db-param>
      </database>
    </body>
  </loop>
</config>
Email Automation
Automated email notifications based on scraped data and business rules.
Features:
- Email sending
- Template processing
- Conditional logic
- Error handling
<config xmlns="http://org.webharvest/schema/2.1/core">
  <!--
    Alert mailer: reads price_alerts.xml and, when it contains at least one
    alert element, sends an HTML summary to the administrator.
  -->

  <!-- Load the stored alerts -->
  <def var="alertData">
    <file path="price_alerts.xml" action="read"/>
  </def>

  <!-- Evaluates whether the file holds any alert elements -->
  <def var="hasAlerts">
    <xpath expression="count(//alert) > 0">
      <get var="alertData"/>
    </xpath>
  </def>

  <!-- Build and send the mail only when there is something to report -->
  <if condition="${hasAlerts}">
    <then>
      <!--
        HTML body of the message.
        NOTE(review): productName and price are not defined anywhere in this
        config. Confirm where they are expected to come from.
        Template body is kept flush-left: whitespace inside it is part of the
        produced text.
      -->
      <def var="emailContent">
        <template>
<![CDATA[
<html>
<body>
<h2>Price Alert Summary</h2>
<p>The following products have price alerts:</p>
<ul>
<li>Product: ${productName} - Price: $${price}</li>
</ul>
<p>Generated at: ${sys.currentTime()}</p>
</body>
</html>
]]>
</template>
      </def>

      <!-- Send the prepared content through the configured SMTP host and port -->
      <mail to="admin@example.com"
            from="scraper@example.com"
            subject="Price Alert Notification"
            smtphost="smtp.example.com"
            smtpport="587">
        <get var="emailContent"/>
      </mail>
    </then>
  </if>
</config>