Sleep/delay processor
Core v2.2.0
The <sleep> processor pauses scraper execution for a specified time period.
This is essential for rate limiting, respecting robots.txt crawl delays, waiting for
dynamic content, retry backoff strategies, and avoiding server overload.
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
  <!-- Iterate over the tokens of ${urls}, capped at 100 iterations. -->
  <loop maxloops="100" item="url">
    <tokenize>${urls}</tokenize>
    <http url="${url}"/>
    <!-- Pause for 2000 ms after each request before the next iteration. -->
    <sleep milliseconds="2000"/>
  </loop>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
  <!-- Retry state: number of failed attempts so far and a completion flag. -->
  <def var="retryCount" value="0"/>
  <def var="success" value="false"/>
  <!--
    Keep requesting until a response arrives or 5 attempts have failed.
    The logical-AND and less-than operators are written as entities because
    raw ampersand and less-than characters are not allowed inside an XML
    attribute value; the parser hands the unescaped expression to the engine.
  -->
  <while condition="${!success &amp;&amp; retryCount &lt; 5}">
    <def var="response">
      <http url="https://api.example.com/data"/>
    </def>
    <if condition="${empty(response)}">
      <!-- Exponential backoff: 1s, 2s, 4s, 8s, 16s -->
      <def var="delay">
        <script>Math.pow(2, parseInt(context.getVar("retryCount"))) * 1000</script>
      </def>
      <sleep milliseconds="${delay}"/>
      <!-- Count this failed attempt so the next delay doubles. -->
      <def var="retryCount">
        <script>parseInt(context.getVar("retryCount")) + 1</script>
      </def>
    </if>
    <else>
      <!-- Non-empty response: set the flag so the loop condition becomes false. -->
      <def var="success" value="true"/>
    </else>
  </while>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
  <!-- Target API permits 10 requests per minute. -->
  <loop maxloops="10" item="endpoint">
    <tokenize>${endpoints}</tokenize>
    <http url="https://api.example.com/${endpoint}"/>
    <!-- 6000 ms between requests: 60 s / 10 requests = 6 s each. -->
    <sleep milliseconds="6000"/>
  </loop>
</config>