from numpy import *
from math import *
from rpy import *

#additional libraries are imported, rpy makes R functions usable for Python, numpy allows Python to work with matrices

filename="***LPhrMk-153***" #base name of the input data text file (".txt" is appended when the file is read); replace by the name of your own input file

def readt(f, folder='***C:/Python25/***'): #reads text file
    """Read the whitespace-separated input table ``folder + f + ".txt"``.

    The first line of the file holds the column names.  On every following
    line the first two fields are merged into the row label (second field
    followed by the first field) and the remaining fields are the data
    cells.  Blank lines are ignored; lines with fewer than two fields are
    reported and skipped.

    Parameters:
        f: base name of the input file, without the ".txt" extension.
        folder: path prefix of the folder holding the file. Replace the
            default by the path of the folder where the input text file
            is saved.

    Returns:
        A tuple (s, arow, acol): the list of data rows (each a list of
        strings), the list of row labels and the list of column names.
        All three are empty when the file cannot be opened.
    """
    s = []
    arow = []
    acol = []
    try:
        handle = open(folder + f + ".txt", 'r')
    except IOError:
        # Best effort, as before: report and hand back an empty table.
        print("File %s not found" % f)
        return s, arow, acol
    try:
        acol = handle.readline().split()
        lineno = 1  # the header was line 1
        for line in handle:
            lineno += 1
            fields = line.split()
            if not fields:
                continue  # blank line, e.g. at the end of the file
            if len(fields) < 2:
                print("Line %d of file %s is malformed, skipped" % (lineno, f))
                continue
            s.append(fields[2:])
            arow.append(fields[1] + fields[0])
    finally:
        # Close the handle even if reading fails half way through.
        handle.close()
    return s, arow, acol

def writeforr(f, aa, acol, arow, folder='***C:/Programme/R/R-2.5.1/***'): #saves the input data text file for R
    """Write the data table to ``folder + f + "-fuerR.txt"`` for R.

    The first line holds the column names, preceded by a single space.
    Every following line holds a row label followed by one cell per column
    name, separated by single spaces.

    Parameters:
        f: base name of the output file.
        aa: list of data rows, each a list of strings.
        acol: list of column names.
        arow: list of row labels, one per row of ``aa``.
        folder: path prefix of the target folder. Specify the exact path
            of your R folder here.

    Prints "Could not save" when the file cannot be written or when a row
    has fewer cells than there are column names (or a row label is
    missing).  The text is assembled before the file is opened, so a
    malformed table does not leave a truncated file behind.
    """
    try:
        ncol = len(acol)
        header = " " + "".join([name + " " for name in acol])
        lines = [header[:-1] + "\n"]  # drop the trailing separator
        for i in range(len(aa)):
            # Index explicitly so that a short row raises IndexError
            # instead of silently producing a ragged table.
            cells = [arow[i]] + [aa[i][j] for j in range(ncol)]
            lines.append(" ".join(cells) + "\n")
        out = open(folder + f + '-fuerR.txt', 'w')
        try:
            out.write("".join(lines))
        finally:
            out.close()
    except (IOError, IndexError):
        print("Could not save")

def writercode(f, p1, folder='C:/Python25/'): #saves the text file containing the R code
    """Write the string ``p1`` plus a final newline to ``folder + f + "-langlist.txt"``.

    Parameters:
        f: base name of the output file.
        p1: the text to save (the generated R code).
        folder: path prefix of the target folder. Replace the default by
            the path of the folder where this file should be saved.

    Prints "Could not save" when the file cannot be opened or written.
    """
    text = p1 + "\n"
    try:
        out = open(folder + f + '-langlist.txt', 'w')
        try:
            out.write(text)
        finally:
            # Close the handle even when the write fails.
            out.close()
    except IOError:
        print("Could not save")

def hammingdist(s1,s2,data): #calculates the distance for the distance matrix in three different ways
    """Return three distances between rows ``s1`` and ``s2`` of ``data``.

    Positions where either row holds "NA", "_" or "?" (not attested, upper
    case only) are left out.  Over the remaining positions each distance is
    1 minus the share of matching positions, where a partial match (the
    values differ but share at least one "="-separated alternative) counts

      * as a mismatch for the first distance,
      * as a full match for the second distance,
      * as half a match for the third distance.

    Parameters:
        s1, s2: indices of the two rows to compare.
        data: list of rows, each a list of strings; the length of the
            first row determines how many positions are compared.

    Returns:
        A tuple of three floats (strict, lenient, halfway).

    Raises:
        ZeroDivisionError: if no position is attested in both rows, so
            that the distance is undefined.
    """
    missing = ("NA", "_", "?")
    row1 = data[s1]
    row2 = data[s2]
    l = len(data[0])
    same_and = 0      # identical values only
    same_or = 0       # identical or partially matching values
    same_mix = 0.0    # partial matches counted as 0.5
    avail = l         # number of positions attested in both rows
    for i in range(l):
        v1 = row1[i]
        v2 = row2[i]
        if v1 in missing or v2 in missing:
            avail -= 1
        elif v1 == v2:
            same_and += 1
            same_or += 1
            same_mix += 1
        else:
            alternatives2 = v2.split("=")
            for alternative in v1.split("="):
                if alternative in alternatives2:
                    same_or += 1
                    same_mix += 0.5
                    break
    if avail == 0:
        # Same exception type the bare division would raise, but with an
        # explanation of which rows cannot be compared.
        raise ZeroDivisionError("rows %s and %s share no attested position" % (s1, s2))
    return (float(1) - (float(same_and) / avail),
            float(1) - (float(same_or) / avail),
            float(1) - (float(same_mix) / avail))

def simlines(data): #calculates the three distance matrices
    """Build the three symmetric distance matrices for the rows of ``data``.

    Every pair of distinct rows is passed once to hammingdist and the three
    returned distances are stored at both mirrored positions of the
    corresponding matrix.  The diagonals keep their initial value of zero.

    Returns:
        A tuple of three n-by-n float arrays, n being the number of rows,
        in the order returned by hammingdist.
    """
    size = len(data)
    shape = (size, size)
    resultand = zeros(shape, dtype=float)
    resultor = zeros(shape, dtype=float)
    resultmix = zeros(shape, dtype=float)
    for lower in range(size):
        for upper in range(lower):
            d_and, d_or, d_mix = hammingdist(lower, upper, data)
            resultand[lower, upper] = d_and
            resultor[lower, upper] = d_or
            resultmix[lower, upper] = d_mix
            # mirror into the other triangle
            resultand[upper, lower] = d_and
            resultor[upper, lower] = d_or
            resultmix[upper, lower] = d_mix
    return resultand, resultor, resultmix

def spalte(a):
    """Return the element of the sequence ``a[0]`` at position ``a[1]``.

    ``a`` is a (sequence, index) pair, which lets the function be mapped
    over zipped rows and indices to pick out one column.
    """
    sequence = a[0]
    position = a[1]
    return sequence[position]

#reads the data from a text file: aa receives the data cells, arow the row names and acol the column names
aa, arow, acol = readt(filename)

#some rough checks whether the data has the form it should have; if a message is printed the input file must be corrected
rowl = len(aa[0]) #every row is expected to be as long as the first one
for i, cells in enumerate(aa):
    ncells = len(cells)
    if ncells != rowl:
        print("Length of Row Error in Line %s : %s    %s Items" % (i, arow[i], ncells))
if rowl != len(acol):
    print("Length of Colnames Error")
if len(aa) != len(arow):
    print("Length of Rownames Error")
aband, abor, abmix = simlines(aa)
print(aband) #not necessary, just shows that the program has done something; the diagonal should be zero

#replace the path by the path of your R folder
rfolder = "***C:/Programme/R/R-2.5.1/***"
rtables = [
    (aband, "-whole.txt"),     #distance matrix with partially similar = different
    (abor, "-wholeor.txt"),    #same with partially similar = identical
    (abmix, "-wholemix.txt"),  #same with partially similar counted halfway (0.5)
    (arow, "-rownames.txt"),   #the row names
    (acol, "-colnames.txt"),   #the column names
]
for rtable, rsuffix in rtables:
    #each table goes to its own text file in the R folder
    r.write_table(rtable, rfolder + filename + rsuffix)

langall=[] #for every column: [[column name], list of its most frequent values], used for the maps
for i in range(len(aa[0])):
    #count the tokens of every value in column i directly; searching the
    #printed form of the list miscounts values that contain a quote or a backslash
    tokens = {}
    for cells in aa:
        tokens[cells[i]] = tokens.get(cells[i], 0) + 1
    for unattested in ("NA", "_", "?"): #unattested values are never mapped
        tokens.pop(unattested, None)
    #threshold is 2: all values with more than 1 token are retained;
    #the count comes first so that sorting orders by frequency
    frequent = [[count, value] for value, count in tokens.items() if count > 1]
    frequent.sort(reverse=True) #most frequent first, ties in reverse order of the value
    frequent = frequent[:11] #limits the number of values to eleven
    langall += [[[acol[i]], [value for count, value in frequent]]] #column name plus list is added

#t is a string that will contain the R code; its fixed opening part is assembled here
rcode_head = [
    'data<-read.table("' + filename + '-fuerR.txt")\n',
    #the R code pre-specifies the halfway identical count; if "wholemix" is changed to "wholeor" or "whole"
    #here or in the R code text file, one of the other distance matrices is used for the plots
    'matr<-read.table("' + filename + '-wholemix.txt")\n',
    'coln<-read.table("' + filename + '-colnames.txt")[,1]\n',
    'rown<-read.table("' + filename + '-rownames.txt")[,1]\n',
    'rown->rownames(matr)->colnames(matr)\n',
    #saves the matrix file again, now with row and column names
    'write.table(matr,"' + filename + '-wholemix.txt")\n',
    'cmd<-cmdscale(matr,k=20)\n',
    'x<-cmd[,1];y<-cmd[,2]\n',
    #plots dimensions 1 and 2 with rowname labels
    'plot(x,y,col="white"); text(x,y,rown,cex=.6)\n',
    #a function category is defined in R which plots the points of u in x and y with defined color, size and shape
    'category<-function(u,v,w,c){\n',
    'for(i in 1:length(u)) points(x[u[i]],y[u[i]],col=v, cex=c, pch=w)\n',
    'return()}\n',
]
t = "".join(rcode_head)

#color, shape and size symbols for the plots in R are determined arbitrarily, too many actually, 11 would be enough
col1=["#FF0000","#0000FF","#00FF00","#FFAA00","#00AA33","#AA00AA","#774400","#888888","#000000","#BB8800","#00FFFF","#FFFF00","#8855FF","#993300","#888888",
      "#333333"] #defines the colors; the last literal had been split across two lines, which is a syntax error
shape1=[15,16,17,18,15,16,17,18,15,16,17,18,15,16,17,18,15,16,17,18] #defines the shape of the symbols
size1=[1,1.5,1.5,1.5,1,1.5,1.5,1.5,1,1.5,1.5,1.5,1,1.5,1.5,1.5,1,1.5,1.5,1.5] #defines the size of the symbols

#for every column one plot is added to the R code: all points, then the frequent values as colored symbols, then a legend
#NOTE: column names and values are inserted into the R code unescaped; a double quote in the data would break the R strings
for i in range(len(langall)):
    feature = langall[i][0][0] #column name, used as plot title
    kinds = langall[i][1] #frequent values of this column, most frequent first
    langlen = len(kinds)
    t+="h<-data[,rep("+str(i+1)+",2)]\n"
    t+="h[,2]<-c(1:"+str(len(aa))+")\n"
    t+="split(h,h[,1])->g\n"
    t+="plot(x,y, main=\""+feature+"\",xlab=\"Dimension 1\",ylab=\"Dimension 2\",cex=.5)\n"

    for j in range(langlen):
        t+="category(g$\""+kinds[j]+"\"[,2],\""+col1[j]+"\","+str(shape1[j])+","+str(size1[j])+")\n"

    if langlen == 0:
        #no value reached the threshold: there is nothing to explain, and stripping
        #separators from empty lists would produce malformed R code, so no legend is written
        continue

    labels = ",".join(["\""+kinds[j]+"\"" for j in range(langlen)])
    colors = ",".join(["\""+col1[j]+"\"" for j in range(langlen)])
    shapes = ",".join([str(shape1[j]) for j in range(langlen)])
    sizes = ",".join([str(size1[j]) for j in range(langlen)])
    #the legend is positioned in the upper left corner, if this turns out to be inconvenient
    #replace all instances of "min(x),max(y)" by the coordinates where the legend should appear in the R code
    t+="legend(min(x),max(y),legend=c("+labels+"),col=c("+colors+"),pch=c("+shapes+"),bty=\"n\",pt.cex=c("+sizes+"))\n"

writercode(filename,t) #writes the R code to a text file
writeforr(filename,aa,acol,arow) #writes the original data to a text file that is readable for R