QB64 Phoenix Edition
String Tokenizer - Printable Version

+- QB64 Phoenix Edition (https://staging.qb64phoenix.com)
+-- Forum: QB64 Rising (https://staging.qb64phoenix.com/forumdisplay.php?fid=1)
+--- Forum: Code and Stuff (https://staging.qb64phoenix.com/forumdisplay.php?fid=3)
+---- Forum: Utilities (https://staging.qb64phoenix.com/forumdisplay.php?fid=8)
+---- Thread: String Tokenizer (/showthread.php?tid=1700)



String Tokenizer - a740g - 05-25-2023

Not sure if anyone will find this useful. I based my code off another simple string tokenizer that I found on the old QB64 forum. However, that one was too simple for my needs. So, I made some changes.

Original: Split1000 (simple string parser) Collaboration (alephc.xyz)


Code: (Select All)
$CONSOLE:ONLY
OPTION _EXPLICIT

' Demo driver: tokenizes one sample declaration line and lists every array entry
' together with its index, pausing after each line of output.
DIM sample AS STRING
DIM AS LONG found, slot

' The array must be dynamic (REDIM) because TokenizeString resizes it.
' It starts at index -2; TokenizeString fills from whatever the lower bound is.
REDIM parts(-2 TO -2) AS STRING

sample = "Function MyFunc(MyStr As String, Optional MyArg1 As Integer = 5, Optional MyArg2 = 'Dolores Abernathy')"

' Delimiters are ( ) , = and space; delimiters are not returned; ' opens and closes a quote
found = TokenizeString(sample, "(),= ", 0, "''", parts())
PRINT found; " tokens parsed for: "; sample

slot = LBOUND(parts)
DO WHILE slot <= UBOUND(parts)
    PRINT slot; "="; parts(slot)
    SLEEP 1
    slot = slot + 1
LOOP

END

' TokenizeString& - splits a string into tokens and stores them in a dynamic string array
'
' Parameters:
'   text         - the string to split
'   delims       - every character in this string acts as a delimiter
'   returnDelims - when non-zero, each delimiter (and each quote character) is also stored,
'                  in order of appearance, as its own one-character entry
'   quoteChars   - first character opens a quote, last character closes it (meant to be 2 chars)
'   tokens()     - dynamic array receiving the entries; filling starts at its current lower bound
'
' Returns the number of entries stored. When that number is above zero the array is trimmed to
' exactly that many elements; otherwise the array is left exactly as it was passed in.
' Text between the quote characters becomes a single token, delimiters included.
' Empty tokens are never stored.
FUNCTION TokenizeString& (text AS STRING, delims AS STRING, returnDelims AS _BYTE, quoteChars AS STRING, tokens() AS STRING)
    DIM textLen AS LONG
    DIM AS LONG idx, slot, total
    DIM AS STRING ch, piece, pending, openQ, closeQ
    DIM AS _BYTE quoted, isBreak

    textLen = LEN(text)
    IF textLen = 0 THEN EXIT FUNCTION ' empty input: returns 0 and the array is untouched

    openQ = LEFT$(quoteChars, 1)
    closeQ = RIGHT$(quoteChars, 1)
    slot = LBOUND(tokens) ' next free position in the array

    FOR idx = 1 TO textLen
        ch = MID$(text, idx, 1)

        ' Work out whether this character terminates the piece collected so far
        IF quoted THEN
            isBreak = (ch = closeQ) ' inside a quote only the closing quote char breaks
            IF isBreak THEN quoted = 0
        ELSEIF ch = openQ THEN
            isBreak = -1 ' the opening quote is checked before the delimiter list
            quoted = -1
        ELSE
            isBreak = (INSTR(delims, ch) > 0)
        END IF

        IF isBreak THEN
            GOSUB flush_piece
            IF returnDelims THEN
                pending = ch
                GOSUB store_pending
            END IF
        ELSE
            piece = piece + ch
        END IF
    NEXT

    GOSUB flush_piece ' whatever is left after the last character (also covers an unclosed quote)

    ' Drop the unused tail left behind by the chunked growth
    IF total > 0 THEN REDIM _PRESERVE tokens(LBOUND(tokens) TO slot - 1) AS STRING

    TokenizeString = total

    EXIT FUNCTION

    ' Stores the piece collected so far, unless it is empty, and starts a new one
    flush_piece:
    IF LEN(piece) = 0 THEN RETURN
    pending = piece
    piece = ""
    GOSUB store_pending
    RETURN

    ' Writes pending to the next free position, then grows the array by 512 once it is full
    store_pending:
    tokens(slot) = pending
    total = total + 1
    slot = slot + 1
    IF slot > UBOUND(tokens) THEN REDIM _PRESERVE tokens(LBOUND(tokens) TO UBOUND(tokens) + 512) AS STRING
    RETURN
END FUNCTION

[Image: Screenshot-2023-05-26-055552.png]


RE: String Tokenizer - RhoSigma - 05-25-2023

Yes, I remember there was once real hype there about string parsing/splitting/tokenizing functions; here's my approach. It was originally made for my GuiTools project, but as it has no dependencies on GuiTools it can also be used as a standalone function.

ParseLine&() function


RE: String Tokenizer - a740g - 05-25-2023

(05-25-2023, 09:33 PM)RhoSigma Wrote: Yes, I remember there was once real hype there about string parsing/splitting/tokenizing functions; here's my approach. It was originally made for my GuiTools project, but as it has no dependencies on GuiTools it can also be used as a standalone function.

ParseLine&() function

Thanks @RhoSigma. This is truly awesome. I'm not sure why I did not find it earlier while searching the old forums.

I did find one of your posts on this forum: Text Parser (qb64phoenix.com)

But the link you posted in that is broken, I guess.


RE: String Tokenizer - Kernelpanic - 05-25-2023

I know the "StringTokenizer" class from Java. Recreating this might not be easy. It would probably make more sense to be able to call a corresponding program in Java from QB64 with the transfer of a text. Just like it is with C.

In Java:
Code: (Select All)
/* StringTokenizer example - 26 May 2023 */

import java.util.*;

/**
 * Minimal StringTokenizer example: prints every word of a fixed sample
 * sentence on its own line.
 */
public class BeispielToken
{
    public static void main(String[] args)
    {
        final String sentence = "Dies ist nur ein Test";

        // No delimiter argument is given, so StringTokenizer uses its default delimiter set.
        for (StringTokenizer words = new StringTokenizer(sentence); words.hasMoreTokens(); )
        {
            System.out.println(words.nextToken());
        }
    }
}

[Image: String-Tokenizer2023-05-26.jpg]


RE: String Tokenizer - a740g - 05-26-2023

(05-25-2023, 11:04 PM)Kernelpanic Wrote: I know the "StringTokenizer" class from Java. Recreating this might not be easy. It would probably make more sense to be able to call a corresponding program in Java from QB64 with the transfer of a text. Just like it is with C.

In Java:
Code: (Select All)
/* StringTokenizer example - 26 May 2023 */

import java.util.*;

/**
 * Minimal StringTokenizer example: prints every word of a fixed sample
 * sentence on its own line.
 */
public class BeispielToken
{
    public static void main(String[] args)
    {
        final String sentence = "Dies ist nur ein Test";

        // No delimiter argument is given, so StringTokenizer uses its default delimiter set.
        for (StringTokenizer words = new StringTokenizer(sentence); words.hasMoreTokens(); )
        {
            System.out.println(words.nextToken());
        }
    }
}

[Image: String-Tokenizer2023-05-26.jpg]

The Java StringTokenizer is exactly what the design of this is based on. And after looking at RhoSigma's code I took some inspiration and got carried away. lol.

Code: (Select All)
$CONSOLE:ONLY
OPTION _EXPLICIT

' Demo driver: tokenizes one sample declaration line and lists every array entry
' together with its index, pausing after each line of output.
DIM sample AS STRING
DIM AS LONG found, slot

' The array must be dynamic (REDIM) because TokenizeString resizes it.
' It starts at index -2; TokenizeString fills from whatever the lower bound is.
REDIM parts(-2 TO -2) AS STRING

sample = "Function MyFunc(MyStr As String, Optional MyArg1 As Integer = 5, Optional MyArg2 = 'Dolores Abernathy')"

' Delimiters are ( ) , = and space; delimiters are not returned; ' opens and closes a quote
found = TokenizeString(sample, "(),= ", 0, "''", parts())
PRINT found; " tokens parsed"

slot = LBOUND(parts)
DO WHILE slot <= UBOUND(parts)
    PRINT slot; "="; parts(slot)
    SLEEP 1
    slot = slot + 1
LOOP

END

' TokenizeString& - splits a string into tokens and stores them in a dynamic string array
'
' Parameters:
'   text         - the string to split
'   delims       - every character in this string acts as a delimiter
'   returnDelims - when non-zero, each delimiter (and each quote character) is also stored,
'                  in order of appearance, as its own one-character entry
'   quoteChars   - first character opens a quote, last character closes it (meant to be 2 chars)
'   tokens()     - dynamic array receiving the entries; filling starts at its current lower bound
'
' Returns the number of entries stored. When that number is above zero the array is trimmed to
' exactly that many elements; otherwise the array is left exactly as it was passed in.
' Text between the quote characters becomes a single token, delimiters included.
' Empty tokens are never stored.
FUNCTION TokenizeString& (text AS STRING, delims AS STRING, returnDelims AS _BYTE, quoteChars AS STRING, tokens() AS STRING)
    DIM textLen AS LONG
    DIM AS LONG idx, slot, total
    DIM AS STRING ch, piece, pending, openQ, closeQ
    DIM AS _BYTE quoted, isBreak

    textLen = LEN(text)
    IF textLen = 0 THEN EXIT FUNCTION ' empty input: returns 0 and the array is untouched

    openQ = LEFT$(quoteChars, 1)
    closeQ = RIGHT$(quoteChars, 1)
    slot = LBOUND(tokens) ' next free position in the array

    FOR idx = 1 TO textLen
        ch = MID$(text, idx, 1)

        ' Work out whether this character terminates the piece collected so far
        IF quoted THEN
            isBreak = (ch = closeQ) ' inside a quote only the closing quote char breaks
            IF isBreak THEN quoted = 0
        ELSEIF ch = openQ THEN
            isBreak = -1 ' the opening quote is checked before the delimiter list
            quoted = -1
        ELSE
            isBreak = (INSTR(delims, ch) > 0)
        END IF

        IF isBreak THEN
            GOSUB flush_piece
            IF returnDelims THEN
                pending = ch
                GOSUB store_pending
            END IF
        ELSE
            piece = piece + ch
        END IF
    NEXT

    GOSUB flush_piece ' whatever is left after the last character (also covers an unclosed quote)

    ' Drop the unused tail left behind by the chunked growth
    IF total > 0 THEN REDIM _PRESERVE tokens(LBOUND(tokens) TO slot - 1) AS STRING

    TokenizeString = total

    EXIT FUNCTION

    ' Stores the piece collected so far, unless it is empty, and starts a new one
    flush_piece:
    IF LEN(piece) = 0 THEN RETURN
    pending = piece
    piece = ""
    GOSUB store_pending
    RETURN

    ' Writes pending to the next free position, then grows the array by 512 once it is full
    store_pending:
    tokens(slot) = pending
    total = total + 1
    slot = slot + 1
    IF slot > UBOUND(tokens) THEN REDIM _PRESERVE tokens(LBOUND(tokens) TO UBOUND(tokens) + 512) AS STRING
    RETURN
END FUNCTION


I'll update the main post.


RE: String Tokenizer - Ultraman - 05-26-2023

I am a fan of using strtok. My tokenize function worked quite well as a wrapper for it.


RE: String Tokenizer - Kernelpanic - 05-26-2023

I tried to write a program in C that corresponds to the StringTokenizer in Java, and then call it from QB64, but it does not work.
There are no problems when compiling, but the program crashes when run.

I've tried everything I can think of for over two hours, and according to the manuals, I don't know why the program crashes.
The developers of QB64 know C/C++ - why does the program crash? Where is the mistake?

Code: (Select All)
//Beispiel für StringTokenizer aus Java in C
//Schildt, S.338 - 26. Mai 2023

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Splits a fixed sentence at spaces with strtok() and prints the words:
 * the first word, two newlines, then each remaining word preceded by a newline.
 */
int main(void)
{
    /*
     * strtok() overwrites delimiters in the scanned buffer with '\0', so the
     * text must be a writable array. Passing a string literal directly (as the
     * original did) modifies read-only storage, which is undefined behaviour
     * and the cause of the crash.
     */
    char sentence[] = "They came never back!";
    const char *delims = " ";
    char *text;

    text = strtok(sentence, delims);
    if (text)
    {
        /* Print through "%s": token text must never be used as the format string. */
        printf("%s", text);
    }
    printf("\n\n");

    /* A NULL first argument tells strtok() to continue with the previous buffer. */
    while ((text = strtok(NULL, delims)) != NULL)
    {
        printf("\n%s", text);
    }

    return(0);
}

[Image: Token-Problem2023-05-26.jpg]


RE: String Tokenizer - Kernelpanic - 05-30-2023

OK, I have got this working now. Let's see if this can also be integrated into Basic. That would be good, because in QB64 it's far too complicated.

C and Java (where it is even easier) already have this. Why reinvent the wheel? I think it would be much easier for the developers to allow access to a C or Java routine from Basic.

Well, I'm not a developer. It's just an idea.


Code: (Select All)
//Beispiel aus: https://www.proggen.org/doku.php?id=c:lib:string:strtok
//Zeichenkette zerlegen in ihre einzelnen Wörter - 30. Mai 2023

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX 100

/*
 * Splits a sentence into its words with strtok(), printing each word with a
 * running number, then prints an untouched copy of the sentence.
 */
int main(void)
{
    char sentence[] = "They never came back!";
    char pristine[MAX];
    char separators[] = " ";
    char *word;
    int number;

    /* Copy the text first: strtok() modifies the buffer it scans. */
    strcpy(pristine, sentence);
    printf ("Zerlege Text: %s\n", sentence );

    /* First call passes the buffer; every later call passes NULL to continue in it. */
    for (number = 1, word = strtok(sentence, separators);
         word != NULL;
         word = strtok(NULL, separators), number++)
    {
        printf("Wort%2d: %s\n", number, word);
    }

    printf("\nAusgabe aufgeteilter Text: %s\n", pristine);

    return 0;
}

[Image: ZK-zerlegen-Token.jpg]


RE: String Tokenizer - Ultraman - 06-29-2023

I completely forgot to come back and show the tokenize function I used to use that took advantage of strtok:

Code: (Select All)

Option _Explicit

' Demo driver: builds three tab-separated records, each ended by a line feed,
' splits them with tokenize (tab and line feed both act as delimiters) and
' prints every resulting field on its own line.
Dim As String toTokenize
toTokenize = "36" + Chr$(9) + "Hungry Coyote Import Store" + Chr$(9) + "Yoshi Latimer" + Chr$(9) + "City Center Plaza 516 Main St" + Chr$(9) + "Elgin" + Chr$(9) + "97827" + Chr$(9) + "USA" + Chr$(10)
' Note: this record holds an empty field between "Cork" and "Ireland"
toTokenize = toTokenize + "37" + Chr$(9) + "Hungry Owl All-Night Grocers" + Chr$(9) + "Patricia McKenna" + Chr$(9) + "8 Johnstown Road" + Chr$(9) + "Cork" + Chr$(9) + "" + Chr$(9) + "Ireland" + Chr$(10)
toTokenize = toTokenize + "38" + Chr$(9) + "Island Trading" + Chr$(9) + "Helen Bennett" + Chr$(9) + "Garden House Crowther Way" + Chr$(9) + "Cowes" + Chr$(9) + "PO31 7PJ" + Chr$(9) + "UK" + Chr$(10)
' No trailing Chr$(0) is needed here; tokenize appends the terminator itself

Dim As String delimiter: delimiter = Chr$(9) + Chr$(10)

Dim As Long i
ReDim As String tokenized(0 To 0) ' dynamic, because tokenize resizes it
tokenize toTokenize, delimiter, tokenized()

For i = LBound(tokenized) To UBound(tokenized)
    Print tokenized(i)
Next

' pointerToString$ - copies the zero-terminated C string at pointer into a QB64 string
'
' pointer - address of the first byte of the C string
' Returns the bytes in front of the terminating zero byte, or "" when the
' C string has length zero.
Function pointerToString$ (pointer As _Offset)
    Declare CustomType Library
        Function strlen%& (ByVal ptr As _Unsigned _Offset)
    End Declare

    Dim As _Offset byteCount
    Dim As _MEM block
    Dim As String copied

    byteCount = strlen(pointer)
    If byteCount = 0 Then Exit Function ' nothing to copy, the result stays ""

    block = _Mem(pointer, byteCount) ' memory block covering exactly the string bytes
    copied = Space$(byteCount) ' _MemGet reads as many bytes as the target string is long
    _MemGet block, block.OFFSET, copied
    _MemFree block

    pointerToString = copied
End Function

' tokenize - splits a string with the C library's strtok and stores the pieces in a dynamic array
'
' toTokenize     - the text to split; it is copied, so the caller's string is not modified
' delimiters     - every character in this string acts as a delimiter
' StorageArray() - dynamic string array receiving the tokens, filled from its current lower bound
'
' On return the array holds exactly the tokens found and keeps its original lower bound.
' When no token is found it is reset to a single empty element at that lower bound.
' strtok treats a run of consecutive delimiters as one, so empty fields are not reported.
' strtok also stops at the first Chr$(0), so text after an embedded zero byte is ignored.
Sub tokenize (toTokenize As String, delimiters As String, StorageArray() As String)
    Declare CustomType Library
        Function strtok%& (ByVal str As _Offset, delimiters As String)
    End Declare
    Dim As _Offset tokenized
    ' strtok needs zero-terminated C strings, so work on terminated copies
    Dim As String tokCopy: If Right$(toTokenize, 1) <> Chr$(0) Then tokCopy = toTokenize + Chr$(0) Else tokCopy = toTokenize
    Dim As String delCopy: If Right$(delimiters, 1) <> Chr$(0) Then delCopy = delimiters + Chr$(0) Else delCopy = delimiters
    ' Signed, so that arrays with a negative lower bound work as well
    Dim As Long lowerbound: lowerbound = LBound(StorageArray)
    Dim As Long count ' number of tokens stored so far
    Dim As Long i
    tokenized = strtok(_Offset(tokCopy), delCopy)
    While tokenized <> 0
        i = lowerbound + count
        ' Grow only when the slot does not exist yet; elements passed in are reused first
        If i > UBound(StorageArray) Then ReDim _Preserve StorageArray(lowerbound To i)
        StorageArray(i) = pointerToString(tokenized)
        count = count + 1
        tokenized = strtok(0, delCopy) ' a null pointer continues in the same buffer
    Wend
    If count > 0 Then
        ' Trim to the tokens actually stored, keeping the caller's lower bound
        ReDim _Preserve StorageArray(lowerbound To lowerbound + count - 1)
    Else
        ' Nothing found: leave one empty element rather than requesting an invalid range
        ReDim StorageArray(lowerbound To lowerbound)
    End If
End Sub