Text tokenizer processor
Core v2.2.0
The `<tokenize>` processor splits text content into individual tokens (words, lines,
or custom-delimited parts) and returns them as a ListVariable for iteration with
`<loop>`.
This is useful for processing CSV data, splitting lines, parsing structured text, or any
scenario requiring text segmentation.
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
<!-- Basic usage: tokenize is called without a delimiters attribute, so its default splitting applies. -->
<def var="text" value="apple banana cherry date elderberry"/>
<!-- The loop body runs once per token; the current token is available as "fruit". -->
<loop item="fruit">
<tokenize>${text}</tokenize>
<!-- Append the current token to fruits.txt. -->
<file path="fruits.txt" action="append">
${fruit}
</file>
</loop>
<!-- Processes: apple, banana, cherry, date, elderberry -->
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
    <!-- Sample CSV record made of four comma-separated fields. -->
    <def var="csvLine" value="John,Doe,john@example.com,30"/>

    <!-- Split the record on commas; "field" is the current token, "i" the loop index. -->
    <loop item="field" index="i">
        <tokenize delimiters=",">${csvLine}</tokenize>
        <!-- Store each field in its own variable whose name ends with the loop index. -->
        <def var="field${i}" value="${field}"/>
    </loop>

    <!-- Expected variables: field0=John, field1=Doe, field2=john@example.com, field3=30 -->
    <!-- NOTE(review): the names above assume the loop index starts at 0; confirm against the loop processor documentation. -->
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
    <!-- Input: one URL per line. -->
    <def var="urls">
        https://example.com/page1
        https://example.com/page2
        https://example.com/page3
    </def>

    <!--
        The URLs are separated by line breaks, so the delimiter has to be the
        line-feed character, written as the character reference for code point 10.
        A space delimiter would not split on the line breaks between the URLs.
        The indentation around each URL is removed by the default token trimming,
        and the blank pieces before the first and after the last URL are dropped
        because empty tokens are skipped by default.
    -->
    <loop item="url">
        <tokenize delimiters="&#10;">${urls}</tokenize>
        <!-- Fetch each URL in turn. -->
        <http url="${url}"/>
    </loop>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
    <!-- Semicolon-separated records, each written as "key:value". -->
    <def var="data" value="id:123;name:Product A;price:99.99"/>

    <!-- Outer pass: one iteration per "key:value" pair. -->
    <loop item="pair">
        <tokenize delimiters=";">${data}</tokenize>
        <!-- Inner pass: split the current pair on ":" and capture the result in "key". -->
        <!-- NOTE(review): despite its name, "key" presumably receives both parts of the pair (key and value); confirm. -->
        <def var="key">
            <tokenize delimiters=":">${pair}</tokenize>
        </def>
    </loop>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
    <!-- Input with spaces around every comma-separated value. -->
    <def var="text" value=" apple , banana , cherry "/>

    <!-- trimtokens="true" (the default): surrounding whitespace is stripped from each token. -->
    <tokenize delimiters="," trimtokens="true">${text}</tokenize>
    <!-- Result: ["apple", "banana", "cherry"] -->

    <!-- trimtokens="false": each token keeps its surrounding whitespace. -->
    <tokenize delimiters="," trimtokens="false">${text}</tokenize>
    <!-- Result: [" apple ", " banana ", " cherry "] -->
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config xmlns="http://org.webharvest/schema/2.1/core">
    <!-- Input containing consecutive delimiters, i.e. empty positions between commas. -->
    <def var="data" value="apple,,cherry,,elderberry"/>

    <!-- allowemptytokens="false" (the default): empty positions are left out of the result. -->
    <tokenize delimiters="," allowemptytokens="false">${data}</tokenize>
    <!-- Result: ["apple", "cherry", "elderberry"] -->

    <!-- allowemptytokens="true": empty positions are kept as empty strings. -->
    <tokenize delimiters="," allowemptytokens="true">${data}</tokenize>
    <!-- Result: ["apple", "", "cherry", "", "elderberry"] -->
</config>