#!/bin/sh
# makealias: make alias compressed dic and aff files
# Author: László Németh
# Version: 2019-05-10

export LC_ALL=C

# Print the usage summary on stderr and terminate the script.
# Exits with status 1 so that callers can detect the misuse.
usage()
{
  printf '%s\n' \
    'makealias: make alias compressed dic and aff files for faster load and smaller memory footprint' \
    'Usage: makealias [--minimize-diff old_file_without_file_extension] file.dic file.aff' >&2
  exit 1
}

# Abort the script unless $1 names an existing regular file.
# Arguments: $1 - path to check
# Outputs:   "File not found: PATH" on stderr when the check fails
# Exits with status 1 on failure; returns normally otherwise.
filecheck()
{
  # The quotes matter: an unquoted path containing whitespace makes the test
  # itself fail with a syntax error, which would let a missing file through.
  if [ ! -f "$1" ]; then
    printf 'File not found: %s\n' "$1" >&2
    exit 1
  fi
}

# Accepted command lines:
#   makealias file.dic file.aff
#   makealias --minimize-diff old_base file.dic file.aff
# 0, 1 or 3 arguments can never match either form.
case $# in
  0|1|3) usage ;;
esac

if [ "$1" = "--minimize-diff" ]; then
  filecheck "$2.aff"
  # Kept as one string because the awk command line expands $AWK_ARG
  # unquoted; the old aff path therefore must not contain whitespace.
  AWK_ARG="-v mindiff=$2.aff"
  shift 2
fi

# First remaining argument: the dictionary, which must end in .dic.
case "$1" in
  *.dic) filecheck "$1" ;;
  *) usage ;;
esac

# Second remaining argument: the affix file, which must end in .aff.
case "$2" in
  *.aff) filecheck "$2" ;;
  *) usage ;;
esac

# Output names are derived from the input names with the directory part and
# the extension removed, so every generated file lands in the current directory.
DIC=$(basename "$1" .dic)
AFF=$(basename "$2" .aff)

# Alias definitions must be between FLAG type definition and SFX/PFX lines.
# Copy the aff content that precedes the first SFX/PFX line, dropping
# trailing whitespace ([[:space:]] is the portable spelling of GNU \s).
sed '/^[SP]FX/,$d;s/[[:space:]]*$//' "$2" >"${AFF}_alias.aff"

# Space conversion (space separated multi-word expressions
# are allowed in the dictionary): every space becomes an underscore in a
# temporary copy of the dictionary.
sed 's/ /_/g' "$1" >"${DIC}_alias.tmp.dic"

# Build the AF (flag vector) and AM (morphological field) alias tables and
# rewrite the dictionary and the affix rules so that they refer to the
# table indices.
#   stdout -> ${DIC}_alias.tmp   (rewritten dictionary lines)
#   stderr -> ${AFF}_alias.$$    (rewritten aff lines)
#   END    -> appends the AF/AM tables to ${AFF}_alias.aff
#
# The program uses GNU awk extensions (gensub and the three-argument form of
# match), so gawk is preferred when it is installed.
if command -v gawk >/dev/null 2>&1; then
  AWK_BIN=gawk
else
  AWK_BIN=awk
fi

# $AWK_ARG is intentionally unquoted: it is either empty or "-v mindiff=FILE".
# The name of the output aff file is handed over with -v instead of being
# spliced into the program text, so unusual file names cannot break the
# awk source.
"$AWK_BIN" $AWK_ARG -v alias_aff="${AFF}_alias.aff" '
# Tables:
#   a[flags] -> AF index, b[index] -> flags
#   a2[mfld] -> AM index, c[index] -> morphological fields
#   n and m are the next free AF and AM indices.
BEGIN{
  n=1;m=1
  # make smaller and more readable diff for LibreOffice git
  if (mindiff != "") {
    # Counting restarts at 0 here, so the first "AF " and "AM " line of the
    # old file (presumably the count line - confirm) lands in slot 0, which
    # END never prints.
    n=0;m=0
    # load old AF and AM fields
    while ((getline var < mindiff) > 0) {
      if (var~/^AF /) {
        # drop the trailing " # index" comment
        b[n] = gensub(" #.*$", "", 1, substr(var, 4))
        a[b[n]] = n
        n = n + 1
      }
      if (var~/^AM /) {
        c[m] = substr(var, 4)
        a2[c[m]] = m
        m = m + 1
      }
    }
    # reuse unused AF and AM items
    # first load dic
    # NOTE(review): un and um are never initialised before their first use
    # as subscripts below - confirm this is intended.
    while ((getline $0 < ARGV[1]) > 0) {
      if (match($0, "^([^ \t/]+)(/([^ \t]*))?([ \t]*(.*))?", flds)) {
        if (length(flds[3]) > 0) {
          if(a[flds[3]]) {
            used[flds[3]] = 1
          } else if (!not_used_registered[flds[3]]) {
            not_used[un] = flds[3]
            not_used_registered[flds[3]] = 1
            un++
          }
        }
        if (length(flds[5]) > 0) {
          mfld = ltrim(flds[5])
          # NOTE(review): not_used2 is otherwise indexed by um; this lookup
          # by mfld may have been meant for not_used2_registered - confirm.
          if(a2[mfld] || not_used2[mfld]) {
            used2[mfld] = 1
          } else if (!not_used2_registered[mfld]) {
            not_used2[um] = mfld
            not_used2_registered[mfld] = 1
            um++
          }
        }
      }
    }
    # load aff file
    while ((getline $0 < ARGV[2]) > 0) {
      if ($0 ~ /^[PS]FX/ && NF > 4) {
        if ($4 ~ /\/[^  \t]/) {
          split($4,t,"/")
          if(a[t[2]]){
            used[t[2]] = 1
          } else {
            not_used[un++] = t[2]
          }
        }
        if ($6 != "") {
          # blank the first five fields so that only the morphological
          # part of the rule remains in $0
          $1 = $2 = $3 = $4 = $5 = ""
          $0 = ltrim($0)
          if(a2[$0]) {
            used2[$0] = 1
          } else {
            not_used2[um++] = $0
          }
        }
      }
    }
    # reuse unused items: old slots that are no longer referenced are
    # refilled with values that were not in the old tables
    j = 0
    for (i=0; i < n && j < un; i++) {
      if (!(b[i] in used)) {
        b[i] = not_used[j]
        a[b[i]] = i
        used[b[i]] = 1
        j++
      }
    }
    j = 0
    for (i=0; i < m && j < um; i++) {
      if (!(c[i] in used2)) {
        c[i] = not_used2[j]
        a2[c[i]] = i
        used2[c[i]] = 1
        j++
      }
    }
  }
}

# Return the part of st before its first "/", or st itself when it has none.
# Overwrites the global array t.
function cutslash(st) {
    if (split(st,t,"/") > 1) return t[1]
    return st
}

# Strip leading spaces and squeeze every run of spaces/tabs to one space.
function ltrim(st) {
    sub(/^ +/,"",st)
    gsub("[ \t]+"," ",st)
    return st
}

# Dictionary lines: word[/flags][ morphological fields]
# flds[1] is the word, flds[3] the flags, flds[5] the morphological fields.
FILENAME ~ /.dic$/ && match($0, "^([^ \t/]+)(/([^ \t]*))?([ \t]*(.*))?", flds) {
    # flag vector?
    flags = ""
    if(length(flds[3]) > 0) {
        if (!a[flds[3]]) {
            a[flds[3]]=n
            b[n]=flds[3]
            n++
        }
        flags = "/" a[flds[3]]
    }
    end = ""
    # morphological fields?
    if (length(flds[5]) > 0) {
        mfld = ltrim(flds[5])
        if(!a2[mfld]){
            a2[mfld]=m
            c[m]=mfld
            m++
        }
        end = "\t" a2[mfld]
    }
    print flds[1] flags end
    next
}

# Affix rule lines (more than four fields): replace the continuation flags
# in field 4 and the morphological fields after field 5 with their indices.
# The result goes to /dev/stderr, which the shell redirects to a file.
FILENAME ~ /.aff$/ && /^[PS]FX/ && NF > 4 {
    if ($4 ~ /\/[^  \t]/) {
        split($4,t,"/")
        if(!a[t[2]]){
            a[t[2]]=n
            b[n]=t[2]
            n++
        }
        begin = $1 " " $2 " " $3 " " (t[1]"/"a[t[2]]) " " $5
    } else {
        begin = $1 " " $2 " " $3 " " cutslash($4) " " $5
    }
    end = ""
    if ($6 != "") {
        $1 = $2 = $3 = $4 = $5 = ""
        $0 = ltrim($0)
        if(!a2[$0]) {
            a2[$0]=m
            c[m]=$0
            m++
        }
        end = " " a2[$0]
    }
    print begin end >>"/dev/stderr"
    next
}

# Every other aff line is copied unchanged.
FILENAME ~ /.aff$/ { print $0 >>"/dev/stderr" }

# Append the tables (count line first, then entries 1..count) to the
# output aff file named by alias_aff.
END {
    if (n>1) {
        print "AF", n-1 >> alias_aff
        for(i=1;i<n;i++) print "AF", b[i],"#",i >> alias_aff
    }
    if (m>1) {
        print "AM", m-1 >> alias_aff
        for(i=1;i<m;i++) print "AM " ltrim(c[i]) >> alias_aff
    }
}' "${DIC}_alias.tmp.dic" "$2" >"${DIC}_alias.tmp" 2>"${AFF}_alias.$$"
# Turn the underscores of the rewritten dictionary back into spaces.
# Every underscore is converted, not only those that replaced a space earlier.
sed 's/_/ /g' "${DIC}_alias.tmp" >"${DIC}_alias.dic"

# Append the rewritten SFX/PFX part after the alias tables, dropping trailing
# whitespace ([[:space:]] is the portable spelling of GNU \s).
sed -n '/^[SP]FX/,$p' "${AFF}_alias.$$" | sed 's/[[:space:]]*$//' >>"${AFF}_alias.aff"
echo "output: ${DIC}_alias.dic, ${AFF}_alias.aff"

# Remove the intermediate files; quoting and "--" keep names with
# whitespace or a leading dash from being misread by rm.
rm -- "${DIC}_alias.tmp" "${DIC}_alias.tmp.dic" "${AFF}_alias.$$"
