Master advanced scraping patterns
Step-by-step guide for login, form submission, pagination handling, and detail page scraping.
Complete workflow from login to data extraction
Submit login forms with POST requests
Iterate through multiple pages automatically
Extract links and scrape individual detail pages
Store extracted data to files
Authenticate with POST request
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">

  <!--
    Login step: send the username and password as parameters of a POST
    request to the login URL and keep the response in loginResponse.
    The literal credentials below are sample values for this example.
  -->
  <def var="loginResponse">
    <http method="POST" url="https://example.com/login">
      <http-param name="username" value="myuser"/>
      <http-param name="password" value="mypass"/>
    </http>
  </def>

  <!--
    Write the login response to disk so it can be inspected while debugging.
    The body text and closing tag are kept flush-left on purpose: inside a
    text-bearing element, indentation would become part of the text content.
  -->
  <file action="write" path="output/login-response.html">
${loginResponse}
</file>

</config>
Use method="POST" for form submission.
HTTP session cookies are automatically maintained across requests.
Get all detail page URLs
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">

  <!-- Download the product listing and hold its content in resultsPage. -->
  <def var="resultsPage">
    <http url="https://example.com/products"/>
  </def>

  <!--
    Convert the listing HTML to XML, then select the href attribute of every
    anchor whose class attribute equals 'product-link'. The result is stored
    in productLinks.
  -->
  <def var="productLinks">
    <xpath expression="//a[@class='product-link']/@href">
      <html-to-xml>${resultsPage}</html-to-xml>
    </xpath>
  </def>

  <!-- Log how many links the query produced. -->
  <log message="Found ${productLinks.length} product links"/>

</config>
Scrape each product page
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">

  <!--
    Visit every entry of productLinks, extract the product name and price
    from its detail page, and write them to output/product-<index>.txt.

    productLinks is not defined in this snippet; it is expected to be set
    before this loop runs (for example by a link-extraction step).
    NOTE(review): each link is passed to <http> unchanged, so this assumes
    the extracted href values are absolute URLs. Confirm against the site.
  -->
  <loop item="link" index="i">
    ${productLinks}

    <!-- Download the detail page for the current link. -->
    <def var="detailPage">
      <http url="${link}"/>
    </def>

    <!--
      Convert the HTML to XML once per page. Both XPath queries below read
      this variable instead of each converting the same HTML again.
    -->
    <def var="detailXml">
      <html-to-xml>${detailPage}</html-to-xml>
    </def>

    <!-- Text of the h1 element whose class attribute equals 'title'. -->
    <def var="productName">
      <xpath expression="//h1[@class='title']/text()">${detailXml}</xpath>
    </def>

    <!-- Text of the span element whose class attribute equals 'price'. -->
    <def var="price">
      <xpath expression="//span[@class='price']/text()">${detailXml}</xpath>
    </def>

    <!--
      One output file per product, named after the loop index.
      The body text and closing tag are kept flush-left on purpose: inside a
      text-bearing element, indentation would become part of the text content.
    -->
    <file path="output/product-${i}.txt" action="write">
Name: ${productName}
Price: ${price}
</file>

    <!-- Be polite: wait 2 seconds before requesting the next page. -->
    <sleep time="2000"/>
  </loop>

</config>
Automatically navigate through multiple pages
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">

  <!-- Loop state: the page number to request and whether to keep going. -->
  <def var="currentPage">1</def>
  <def var="hasNextPage">true</def>

  <while condition="${hasNextPage}">

    <!-- Download the listing for the current page number. -->
    <def var="page">
      <http url="https://example.com/products?page=${currentPage}"/>
    </def>

    <!-- Extract and process products on this page -->
    <!-- ... your extraction logic ... -->

    <!-- Look for an anchor whose class attribute equals 'next-page'. -->
    <def var="nextButton">
      <xpath expression="//a[@class='next-page']">
        <html-to-xml>${page}</html-to-xml>
      </xpath>
    </def>

    <!-- Continue only while the query above returned a non-empty result. -->
    <set-var name="hasNextPage">
      <script>!context.getVar('nextButton').isEmpty()</script>
    </set-var>

    <!-- Advance the page counter by one for the next request. -->
    <set-var name="currentPage">
      <script>parseInt(context.getVar('currentPage')) + 1</script>
    </set-var>

    <!-- Pause between page requests. -->
    <sleep time="2000"/>

  </while>

</config>
The <while> loop continues as long as
the "Next" button exists on the page. When there's no next page, the loop stops.