/*
 * Decompiled with CFR 0.152.
 */
package org.carrot2.text.preprocessing;

import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.IntIntAssociativeContainer;
import com.carrotsearch.hppc.IntIntHashMap;
import com.carrotsearch.hppc.cursors.IntIntCursor;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.text.preprocessing.Substring;
import org.carrot2.text.preprocessing.SubstringComparator;
import org.carrot2.text.preprocessing.SuffixSorter;
import org.carrot2.util.IntMapUtils;

/**
 * Discovers repeated token sequences ("phrases") in the suffix-sorted token
 * stream of a {@link PreprocessingContext} and stores them, together with
 * their total and per-document frequencies, in {@code context.allPhrases}.
 *
 * <p>Instances keep no per-call state apart from the {@link SuffixSorter} they
 * delegate to; whether that sorter is safe for concurrent use cannot be told
 * from this class.
 */
public class PhraseExtractor {
    /** Shortest phrase, in tokens, that is ever pushed onto the discovery stack. */
    private static final int MIN_PHRASE_LENGTH = 2;

    /** Common-prefix lengths are capped at this many tokens, which bounds phrase length. */
    static final int MAX_PHRASE_LENGTH = 8;

    /** Performs the suffix sort whose results {@link #extractPhrases} reads from the context. */
    private final SuffixSorter suffixSorter = new SuffixSorter();

    /**
     * Minimum number of distinct document keys a substring's
     * {@code tfByDocument} map must hold for the substring to be reported.
     */
    private final int dfThreshold;

    /**
     * @param dfThreshold minimum number of distinct documents a candidate must
     *        occur in to be kept by {@link #discoverRcs}
     */
    PhraseExtractor(int dfThreshold) {
        this.dfThreshold = dfThreshold;
    }

    /**
     * Runs suffix sorting on {@code context}, discovers repeated substrings,
     * merges equivalent ones and writes the result to
     * {@code context.allPhrases.wordIndices}, {@code context.allPhrases.tf} and
     * {@code context.allPhrases.tfByDocument}.
     *
     * <p>Reads {@code suffixOrder}, {@code lcp}, {@code wordIndex} and
     * {@code documentIndex} from {@code context.allTokens} and
     * {@code stemIndex} from {@code context.allWords}. The three output arrays
     * are parallel: entry {@code k} of each describes the same phrase. When no
     * candidate is found all three are set to empty arrays.
     *
     * @param context preprocessing context to read tokens from and write phrases to
     */
    public void extractPhrases(PreprocessingContext context) {
        // NOTE(review): presumably this is what populates allTokens.suffixOrder and
        // allTokens.lcp read just below -- confirm against SuffixSorter.
        suffixSorter.suffixSort(context);

        final int[] suffixArray = context.allTokens.suffixOrder;
        final int[] lcpArray = context.allTokens.lcp;
        final int[] wordIndexesArray = context.allTokens.wordIndex;
        final int[] documentIndexArray = context.allTokens.documentIndex;
        final int[] stemIndexes = context.allWords.stemIndex;

        final List<Substring> rcs = discoverRcs(suffixArray, lcpArray, documentIndexArray);

        final List<int[]> phraseWordIndexes = new ArrayList<int[]>();
        final IntArrayList phraseTf = new IntArrayList();
        final List<int[]> phraseTfByDocumentList = new ArrayList<int[]>();

        if (!rcs.isEmpty()) {
            // Sorting places equivalent substrings next to each other so that each
            // run of equivalent entries can be collapsed in a single pass.
            Collections.sort(rcs, new SubstringComparator(wordIndexesArray, stemIndexes));

            // Accumulators for the run currently being collapsed.
            Substring mostFrequentOriginal = rcs.get(0);
            int totalPhraseTf = mostFrequentOriginal.frequency;
            final IntIntHashMap phraseTfByDocument = new IntIntHashMap();
            phraseTfByDocument.putAll(mostFrequentOriginal.tfByDocument);

            for (int i = 0; i < rcs.size() - 1; i++) {
                final Substring substring = rcs.get(i);
                final Substring nextSubstring = rcs.get(i + 1);

                if (substring.isEquivalentTo(nextSubstring, wordIndexesArray, stemIndexes)) {
                    // Same run: accumulate counts and remember the most frequent variant.
                    totalPhraseTf += nextSubstring.frequency;
                    // -1 as the offset key: no count is decremented unless a document
                    // index of -1 exists (presumably never the case).
                    addAllWithOffset(phraseTfByDocument, nextSubstring.tfByDocument, -1);
                    if (mostFrequentOriginal.frequency < nextSubstring.frequency) {
                        mostFrequentOriginal = nextSubstring;
                    }
                } else {
                    // Run ended: emit it and start a new one at nextSubstring.
                    phraseWordIndexes.add(copyWordIndexes(mostFrequentOriginal, wordIndexesArray));
                    phraseTf.add(totalPhraseTf);
                    phraseTfByDocumentList.add(IntMapUtils.flatten(phraseTfByDocument));

                    totalPhraseTf = nextSubstring.frequency;
                    mostFrequentOriginal = nextSubstring;
                    phraseTfByDocument.clear();
                    phraseTfByDocument.putAll(nextSubstring.tfByDocument);
                }
            }

            // Emit the run that was still open when the list ended. The
            // representative's own bounds supply both length and content, exactly
            // as for the runs emitted inside the loop.
            phraseWordIndexes.add(copyWordIndexes(mostFrequentOriginal, wordIndexesArray));
            phraseTf.add(totalPhraseTf);
            phraseTfByDocumentList.add(IntMapUtils.flatten(phraseTfByDocument));
        }

        context.allPhrases.wordIndices = phraseWordIndexes.toArray(new int[phraseWordIndexes.size()][]);
        context.allPhrases.tf = phraseTf.toArray();
        context.allPhrases.tfByDocument = phraseTfByDocumentList.toArray(new int[phraseTfByDocumentList.size()][]);
    }

    /**
     * Copies the slice {@code [phrase.from, phrase.to)} of
     * {@code wordIndexesArray} into a new array.
     */
    private static int[] copyWordIndexes(Substring phrase, int[] wordIndexesArray) {
        final int[] wordIndexes = new int[phrase.to - phrase.from];
        System.arraycopy(wordIndexesArray, phrase.from, wordIndexes, 0, wordIndexes.length);
        return wordIndexes;
    }

    /**
     * Scans the suffix and LCP arrays with an explicit stack and collects every
     * substring of {@link #MIN_PHRASE_LENGTH} to {@link #MAX_PHRASE_LENGTH}
     * tokens whose {@code tfByDocument} map has at least {@code dfThreshold}
     * entries at the moment it is popped.
     *
     * <p>The scan covers positions {@code 1 .. lcpArray.length - 2}. Substrings
     * still on the stack when the scan ends are not reported.
     * NOTE(review): presumably the LCP array ends with a value that empties the
     * stack -- confirm against SuffixSorter.
     *
     * @param suffixArray suffix order; {@code suffixArray[i]} is a token position
     * @param lcpArray common-prefix length associated with position {@code i}
     *        of the suffix order
     * @param documentIndexArray document index for each token position
     * @return the collected substrings, in the order they were popped
     */
    private List<Substring> discoverRcs(int[] suffixArray, int[] lcpArray, int[] documentIndexArray) {
        final Substring[] rcsStack = new Substring[lcpArray.length];
        final List<Substring> result = new ArrayList<Substring>();
        int sp = -1;
        int i = 1;

        while (i < lcpArray.length - 1) {
            final int currentSuffixIndex = suffixArray[i];
            final int currentDocumentIndex = documentIndexArray[currentSuffixIndex];
            final int currentLcp = Math.min(MAX_PHRASE_LENGTH, lcpArray[i]);

            if (sp < 0) {
                // Empty stack: open substrings of every length from the minimum up
                // to the current common prefix, if it is long enough.
                if (currentLcp >= MIN_PHRASE_LENGTH) {
                    final int previousDocumentIndex = documentIndexArray[suffixArray[i - 1]];
                    sp = pushSubstrings(rcsStack, sp, i, currentSuffixIndex,
                        MIN_PHRASE_LENGTH, currentLcp,
                        previousDocumentIndex, currentDocumentIndex);
                }
                i++;
                continue;
            }

            final Substring top = rcsStack[sp];
            final int topLength = top.to - top.from;

            if (topLength < currentLcp) {
                // The common prefix grew: the old top stops being the longest
                // substring, so it records the preceding document as its offset key
                // and longer substrings are stacked above it.
                final int previousDocumentIndex = documentIndexArray[suffixArray[i - 1]];
                top.documentIndexToOffset = previousDocumentIndex;
                sp = pushSubstrings(rcsStack, sp, i, currentSuffixIndex,
                    Math.max(MIN_PHRASE_LENGTH, topLength + 1), currentLcp,
                    previousDocumentIndex, currentDocumentIndex);
                i++;
                continue;
            }

            if (topLength == currentLcp) {
                // Another occurrence of the substring on top of the stack.
                top.frequency++;
                top.tfByDocument.putOrAdd(currentDocumentIndex, 1, 1);
                i++;
                continue;
            }

            // The common prefix shrank: pop everything longer than it. Position i is
            // deliberately not advanced; it is re-examined against the new top.
            do {
                final Substring popped = rcsStack[sp--];
                if (popped.tfByDocument.size() >= dfThreshold) {
                    result.add(popped);
                }
                if (sp >= 0) {
                    // Hand the popped counts down to the next shorter substring. That
                    // substring was created with frequency 1 and a count of 1 for its
                    // offset document, so one is subtracted from both to avoid
                    // counting that occurrence twice.
                    final Substring parent = rcsStack[sp];
                    parent.frequency += popped.frequency - 1;
                    addAllWithOffset(parent.tfByDocument, popped.tfByDocument, parent.documentIndexToOffset);
                }
            } while (sp >= 0 && rcsStack[sp].to - rcsStack[sp].from > currentLcp);
        }
        return result;
    }

    /**
     * Pushes one substring for every length in {@code [shortest, longest]},
     * shortest first, all starting at {@code suffixIndex}. Does nothing when
     * {@code shortest > longest}.
     *
     * <p>Every pushed substring starts with a count of 1 for
     * {@code previousDocumentIndex}. The longest one additionally counts
     * {@code currentDocumentIndex} and starts with frequency 2; the shorter ones
     * start with frequency 1 and remember {@code previousDocumentIndex} as
     * their {@code documentIndexToOffset}.
     *
     * @return the new stack pointer
     */
    private static int pushSubstrings(Substring[] rcsStack, int sp, int i, int suffixIndex,
        int shortest, int longest, int previousDocumentIndex, int currentDocumentIndex) {
        for (int length = shortest; length <= longest; length++) {
            final boolean longestOne = length == longest;
            final Substring pushed = new Substring(i, suffixIndex, suffixIndex + length, longestOne ? 2 : 1);
            pushed.tfByDocument = new IntIntHashMap();
            pushed.tfByDocument.put(previousDocumentIndex, 1);
            if (longestOne) {
                pushed.tfByDocument.putOrAdd(currentDocumentIndex, 1, 1);
            } else {
                pushed.documentIndexToOffset = previousDocumentIndex;
            }
            rcsStack[++sp] = pushed;
        }
        return sp;
    }

    /**
     * Adds every count in {@code src} to {@code dest}, subtracting one from the
     * count whose key equals {@code documentIndexToOffset}.
     *
     * @param dest map that receives the counts
     * @param src map whose counts are added; not modified
     * @param documentIndexToOffset key whose count is reduced by one before
     *        being added; a value matching no key leaves all counts unchanged
     */
    private static void addAllWithOffset(IntIntHashMap dest, IntIntHashMap src, int documentIndexToOffset) {
        for (IntIntCursor c : src) {
            final int key = c.key;
            int value = c.value;
            if (key == documentIndexToOffset) {
                value--;
            }
            dest.putOrAdd(key, value, value);
        }
    }
}

