<regexp>

Regular expression processor

Core v2.2.0

Overview

The processor provides powerful regular expression capabilities for pattern matching, text extraction using capture groups, search-and-replace operations, and complex text parsing. It automatically stores captured groups as variables for easy access.

Usage Examples

Example 1: Extract email addresses

example-1.xml
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
<!-- Define "emails" from the regexp matches found in the downloaded contacts page.
     The pattern has a single capture group that wraps the whole e-mail address. -->
<def var="emails">
  <regexp>
    <regexp-pattern>([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})</regexp-pattern>
    <regexp-source>
      <http url="https://example.com/contacts"/>
    </regexp-source>
  </regexp>
</def>

<!-- Iterate over ${emails}; each item is exposed as ${email} and appended to
     emails.txt followed by a line feed (the &#10; character reference). -->
<loop item="email">
  ${emails}
  
  <file path="emails.txt" action="append">${email}&#10;</file>
</loop>
</config>

Example 2: Parse key-value pairs with capture groups

example-2.xml
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
<!-- Sample input: semicolon-separated key:value pairs. -->
<def var="data">name:John;email:john@example.com;age:30</def>

<!-- The first regexp splits ${data} into pairs (each run of non-";" characters);
     every pair is exposed to the rest of the loop as ${pair}. -->
<loop item="pair">
  <regexp>
    <regexp-pattern>([^;]+)</regexp-pattern>
    <regexp-source>${data}</regexp-source>
  </regexp>
  
  <!-- For each pair, extract key and value -->
  <regexp>
    <regexp-pattern>([^:]+):([^:]+)</regexp-pattern>
    <regexp-source>${pair}</regexp-source>
  </regexp>
  
  <!-- group1 = key, group2 = value (Bug #62: properly isolated!).
       NOTE(review): "Bug #62" presumably means the group variables of this inner
       regexp are kept separate from those of the outer one; confirm against the
       issue tracker. The def below creates a variable named after the key. -->
  <def var="${group1}" value="${group2}"/>
</loop>
</config>

Example 3: Find and replace

example-3.xml
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
<!-- Download the page whose links are to be rewritten. -->
<def var="html">
  <http url="https://example.com/page"/>
</def>

<!-- Replace all http:// with https://
     replace="true" selects replace mode: regexp-pattern is what to find,
     regexp-source is the text to search, regexp-result is the replacement. -->
<def var="secureHtml">
  <regexp replace="true">
    <regexp-pattern>http://</regexp-pattern>
    <regexp-source>${html}</regexp-source>
    <regexp-result>https://</regexp-result>
  </regexp>
</def>
</config>

Example 4: Extract URLs with capture groups

example-4.xml
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
<!-- Match http/https URLs in the downloaded links page and record each URL,
     its domain and its path in urls.txt.
     Pattern: "https?://" is the scheme, ([^/]+) captures the domain up to the
     first "/", and (/.*) captures the path from that "/" onward. -->
<loop item="url">
  <regexp>
    <regexp-pattern>https?://([^/]+)(/.*)</regexp-pattern>
    <regexp-source>
      <http url="https://example.com/links"/>
    </regexp-source>
  </regexp>
  
  <!-- group0 = full URL, group1 = domain, group2 = path -->
  <file path="urls.txt" action="append">
    URL: ${group0}
    Domain: ${group1}
    Path: ${group2}
  </file>
</loop>
</config>

Example 5: Case-insensitive search

example-5.xml
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
<!-- Search the contents of log.txt for "error", "warning" or "fail".
     flag-caseinsensitive="true" makes the match ignore letter case, so
     e.g. "ERROR" and "Warning" are found as well. -->
<regexp flag-caseinsensitive="true">
  <regexp-pattern>error|warning|fail</regexp-pattern>
  <regexp-source>
    <file path="log.txt" action="read"/>
  </regexp-source>
</regexp>
</config>

Important Notes

Related Processors