Regular expression processor
Core v2.2.0
The <regexp> processor provides powerful regular expression capabilities:
pattern matching, text extraction using capture groups, search-and-replace operations,
and complex text parsing. It automatically stores captured groups as variables
(group0, group1, group2, ...) for easy access.
<?xml version="1.0" encoding="UTF-8"?>
<!-- Example: find e-mail-like strings in a fetched page and append each one to a file. -->
<config xmlns="http://org.webharvest/schema/2.1/core">
    <!-- "emails" holds whatever the regexp processor yields for the HTTP response below. -->
    <def var="emails">
        <regexp>
            <!-- Single capture group around the whole address: local part, "@", domain, dot, 2+ letters. -->
            <regexp-pattern>([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})</regexp-pattern>
            <regexp-source>
                <http url="https://example.com/contacts"/>
            </regexp-source>
        </regexp>
    </def>
    <!-- Iterate over ${emails}; each item is exposed as ${email} and appended to emails.txt. -->
    <loop item="email">
        ${emails}
        <file path="emails.txt" action="append">${email} </file>
    </loop>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Example: nested regexp matching over a "key:value;key:value" string. -->
<config xmlns="http://org.webharvest/schema/2.1/core">
    <!-- Sample input: pairs are separated by ";" and each pair is "key:value". -->
    <def var="data">name:John;email:john@example.com;age:30</def>
    <loop item="pair">
        <!-- Outer match: every run of characters that contains no ";" is one pair. -->
        <regexp>
            <regexp-pattern>([^;]+)</regexp-pattern>
            <regexp-source>${data}</regexp-source>
        </regexp>
        <!-- Inner match: split the current pair at ":" into two capture groups. -->
        <regexp>
            <regexp-pattern>([^:]+):([^:]+)</regexp-pattern>
            <regexp-source>${pair}</regexp-source>
        </regexp>
        <!-- group1 is the key and group2 is the value (Bug #62: properly isolated!). -->
        <!-- Define a variable named after the key, holding the value. -->
        <def var="${group1}" value="${group2}"/>
    </loop>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Example: search-and-replace mode (replace="true") with a regexp-result replacement. -->
<config xmlns="http://org.webharvest/schema/2.1/core">
    <!-- Fetch the page whose content will be rewritten. -->
    <def var="html">
        <http url="https://example.com/page"/>
    </def>
    <!-- "secureHtml" gets ${html} with every "http://" occurrence swapped for "https://". -->
    <def var="secureHtml">
        <regexp replace="true">
            <!-- What to look for. -->
            <regexp-pattern>http://</regexp-pattern>
            <!-- Where to look. -->
            <regexp-source>${html}</regexp-source>
            <!-- What to put in its place. -->
            <regexp-result>https://</regexp-result>
        </regexp>
    </def>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Example: split each URL found in a page into domain and path using capture groups. -->
<config xmlns="http://org.webharvest/schema/2.1/core">
    <loop item="url">
        <regexp>
            <!--
                Group 1 captures the host (everything up to the first "/"),
                group 2 captures the path (from that "/" onward).
                The pattern previously ended with a stray "}", which Java regex
                treats as a literal character, so only URLs ending in "}" matched.
            -->
            <regexp-pattern>https?://([^/]+)(/.*)</regexp-pattern>
            <regexp-source>
                <http url="https://example.com/links"/>
            </regexp-source>
        </regexp>
        <!-- group0 = full URL, group1 = domain, group2 = path -->
        <!-- The text inside <file> is kept flush-left: it is presumably written verbatim,
             so indenting it would add leading spaces to urls.txt. -->
        <file path="urls.txt" action="append">
URL: ${group0}
Domain: ${group1}
Path: ${group2}
</file>
    </loop>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Example: matching with the case-insensitive flag enabled. -->
<config xmlns="http://org.webharvest/schema/2.1/core">
    <!-- flag-caseinsensitive="true" lets the alternation below match regardless of letter case. -->
    <regexp flag-caseinsensitive="true">
        <!-- Any one of the three keywords. -->
        <regexp-pattern>error|warning|fail</regexp-pattern>
        <!-- The text to scan is read from log.txt. -->
        <regexp-source>
            <file path="log.txt" action="read"/>
        </regexp-source>
    </regexp>
</config>
java.util.regex.Pattern