/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysds.runtime.transform.tokenize.builder;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import org.apache.sysds.runtime.frame.data.FrameBlock;
import org.apache.sysds.runtime.transform.tokenize.DocumentRepresentation;
import org.apache.sysds.runtime.transform.tokenize.Token;
import org.apache.sysds.runtime.transform.tokenize.builder.TokenizerBuilder;
import org.apache.sysds.runtime.util.UtilFunctions;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

/**
 * Tokenizer builder that splits the text of one frame column into tokens using
 * a regular expression (by default runs of whitespace) and records, for every
 * token, the character offset at which it starts in the original text.
 */
public class TokenizerBuilderWhitespaceSplit extends TokenizerBuilder {
    private static final long serialVersionUID = 539127244034913364L;

    /** Column positions whose values form the document key; each is decremented by one before frame access. */
    private final int[] idCols;

    /** Column position of the text to tokenize; decremented by one before frame access. */
    private final int tokenizeCol;

    /** Regular expression passed to {@link String#split(String)}; defaults to one or more whitespace characters. */
    public String regex = "\\s+";

    /**
     * Creates the builder.
     *
     * @param idCols      column positions used to build the key of each document
     * @param tokenizeCol column position of the text to split
     * @param params      optional settings; when non-null and containing the key
     *                    {@code "regex"}, that string replaces the default split expression
     * @throws JSONException if the {@code "regex"} entry cannot be read as a string
     */
    public TokenizerBuilderWhitespaceSplit(int[] idCols, int tokenizeCol, JSONObject params) throws JSONException {
        if (params != null && params.has("regex")) {
            this.regex = params.getString("regex");
        }
        this.idCols = idCols;
        this.tokenizeCol = tokenizeCol;
    }

    /**
     * Splits {@code text} on {@link #regex} and returns the non-empty pieces in
     * order, each paired with its start offset in {@code text}.
     *
     * @param text the text to split; may be {@code null}
     * @return the tokens in order of appearance; an empty list when {@code text} is {@code null}
     */
    public List<Token> splitToTokens(String text) {
        List<Token> tokenList = new ArrayList<>();
        if (text == null) {
            return tokenList;
        }
        String[] textTokens = text.split(this.regex);
        // Position from which the next token is searched. The pieces returned by
        // split are consecutive, non-overlapping substrings of text, so each one
        // starts at or after the end of the previous one.
        int searchFrom = 0;
        for (String textToken : textTokens) {
            if (textToken.isEmpty()) {
                // split yields an empty leading piece when the text starts with a separator
                continue;
            }
            int tokenIndex = text.indexOf(textToken, searchFrom);
            tokenList.add(new Token(textToken, tokenIndex));
            // Move past this token; otherwise a repeated token (or one that is a
            // prefix of the previous token) would be located at the old position again.
            searchFrom = tokenIndex + textToken.length();
        }
        return tokenList;
    }

    /**
     * Tokenizes the rows {@code [rowStart, endIndex)} of {@code in} and stores one
     * {@link DocumentRepresentation} per row at the same row index of
     * {@code internalRepresentation}. The end index is obtained from
     * {@code UtilFunctions.getEndIndex(in.getNumRows(), rowStart, blk)}
     * (presumably {@code rowStart + blk} capped at the row count; confirm against that helper).
     *
     * @param in                     frame holding the id columns and the text column
     * @param internalRepresentation output array, written at the indices of the processed rows
     * @param rowStart               first row to process
     * @param blk                    block size handed to {@code UtilFunctions.getEndIndex}
     */
    @Override
    public void createInternalRepresentation(FrameBlock in, DocumentRepresentation[] internalRepresentation, int rowStart, int blk) {
        int endIndex = UtilFunctions.getEndIndex(in.getNumRows(), rowStart, blk);
        for (int i = rowStart; i < endIndex; ++i) {
            String text = in.getString(i, this.tokenizeCol - 1);
            List<Token> tokenList = this.splitToTokens(text);
            List<Object> keys = new ArrayList<>(this.idCols.length);
            for (int idCol : this.idCols) {
                keys.add(in.get(i, idCol - 1));
            }
            // Assign once per row, after all keys are collected. This also covers
            // the case of no id columns, where the slot previously stayed null.
            internalRepresentation[i] = new DocumentRepresentation(keys, tokenList);
        }
    }
}

