Regexp pattern definition
Core v2.2.0
The `<regexp-pattern>` processor defines the regular expression pattern used for matching and extraction.
Must be used as a child element of `<regexp>`. Supports Java regex syntax with capture groups.
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
  <!-- Link extraction: capture group 1 holds the href value of an anchor tag.
       The pattern only matches tags written exactly as <a href="...">,
       i.e. with href as the sole attribute. -->
  <regexp>
    <!-- CDATA keeps the literal < and " characters of the pattern unescaped. -->
    <regexp-pattern><![CDATA[<a href="([^"]+)">]]></regexp-pattern>
    <!-- Text to search, supplied by the htmlPage expression. -->
    <regexp-source>${htmlPage}</regexp-source>
  </regexp>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
  <!-- Email extraction: capture group 1 spans the whole address, made of a
       local part, an @ sign, a domain, and a final label of two or more
       ASCII letters. -->
  <regexp>
    <regexp-pattern><![CDATA[([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})]]></regexp-pattern>
    <!-- Text to search, supplied by the textContent expression. -->
    <regexp-source>${textContent}</regexp-source>
  </regexp>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
  <!-- Price parsing with two capture groups: group 1 is the run of digits
       before the decimal point, group 2 is the two digits after it. -->
  <regexp>
    <regexp-pattern><![CDATA[Price:\s*\$(\d+)\.(\d{2})]]></regexp-pattern>
    <!-- Literal sample input used to illustrate the groups. -->
    <regexp-source>Price: $19.99</regexp-source>
    <!-- Group 1: returns "19" -->
    <regexp-result index="1"/>
    <!-- Group 2: returns "99" -->
    <regexp-result index="2"/>
  </regexp>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
  <!-- Product attribute extraction with three capture groups:
       group 1 is the data-id value (digits only),
       group 2 is the data-name value (any characters except a double quote),
       group 3 is the data-price value (digits and dots).
       The attributes must appear in exactly this order, separated by single
       spaces, for the pattern to match. -->
  <regexp>
    <regexp-pattern><![CDATA[<div class="product" data-id="(\d+)" data-name="([^"]+)" data-price="([\d.]+)">]]></regexp-pattern>
    <!-- Text to search, supplied by the productHtml expression. -->
    <regexp-source>${productHtml}</regexp-source>
  </regexp>
  <!-- The captured groups are read with <regexp-result index="1"/>,
       <regexp-result index="2"/>, and so on. -->
</config>
element