PCLNLP
/
ClinicalNLP

 
			
			   
				 
					
						
						
							
							import os
import sys
import codecs
import pdb
import math

# Working directory, taken from the first command-line argument when the
# module is loaded. The ranking class reads test_results.tsv from it and
# writes output.txt and rank_result.txt into it.
# NOTE: this name shadows the builtin dir() for the whole module.
dir = sys.argv[1]
class ranking(object):
  """Build per-query rankings from model scores and print retrieval metrics.

  Every method works on files inside the directory named by the module-level
  ``dir`` variable:

  * ``output`` joins the gold test data with ``test_results.tsv`` into
    ``output.txt`` (label, group key, model scores; tab separated).
  * ``ranking_file`` sorts each group of ``output.txt`` by a derived rank
    score and writes ``rank_result.txt``.
  * ``mrr``, ``p_k``, ``map`` and ``ndcg_k`` read ``rank_result.txt`` and
    print one metric each. Label "2" counts as most relevant; labels "2"
    and "1" both count as relevant.

  An empty ranking file yields metrics of 0.0 instead of a division error.
  """

  # Gold test data consumed by output(). Kept as a class attribute so it can
  # be overridden without editing the method.
  test_data_path = "./SampleData/medical_literature_retrieval_test_data/test.tsv"

  def __init__(self):
    print ("init")

  @staticmethod
  def _read_lines(path):
    """Return the lines of the UTF-8 file at path, closing it even on error."""
    with codecs.open(path,"r","utf-8") as handle:
      return handle.read().splitlines()

  def _load_groups(self, filename):
    """Group the lines of dir/filename by their second tab-separated column.

    Returns a dict mapping each group key to its lines, in file order.
    """
    groups = {}
    for line in self._read_lines(dir+"/"+filename):
      groups.setdefault(line.split("\t")[1], []).append(line)
    return groups

  @staticmethod
  def _labels(lines):
    """Return the first tab-separated column (the gold label) of each line."""
    return [line.split("\t")[0] for line in lines]

  @staticmethod
  def _mean(total, count):
    """Return total/count, or 0.0 when there is nothing to average over."""
    return total/count if count else 0.0

  @staticmethod
  def _rank_score(line):
    """Return the sort score of one output.txt line.

    Columns 2-6 hold the model scores. The index of the largest one selects
    a band (5 for the first column down to 1 for the fifth) and the largest
    score itself is added to order lines inside a band.
    """
    scores = [float(x) for x in line.split("\t")[2:7]]
    best = max(scores)
    return (5-scores.index(best))+best

  @staticmethod
  def _reciprocal_rank(labels, wanted):
    """Return 1/rank of the first label found in wanted, or 0.0 if none is."""
    for rank, label in enumerate(labels, 1):
      if label in wanted:
        # Explicit float division: the result must not depend on the
        # interpreter's integer-division rules.
        return 1.0/rank
    return 0.0

  @staticmethod
  def _dcg(gains):
    """Return the discounted cumulative gain of integer labels in rank order."""
    total = 0.0
    for position, gain in enumerate(gains):
      total += (2**gain-1)/math.log(position+2,2)
    return total

  def _ndcg(self, gains):
    """Return DCG of gains over the DCG of the same gains sorted descending.

    The small constant keeps the ratio defined when every gain is 0.
    """
    ideal = sorted(gains, reverse=True)
    return self._dcg(gains)/(self._dcg(ideal)+0.00001)

  def output(self):
    """Write dir/output.txt by pairing gold rows with model result rows.

    Each output line holds columns 0 and 2 of the gold row (header skipped)
    followed by the stripped result row.

    Raises:
      AssertionError: if the two files do not have the same number of rows.
    """
    # Read and validate both inputs before output.txt is opened, so a failed
    # run does not leave a truncated file behind.
    datalist = self._read_lines(self.test_data_path)[1:]
    resultlist = self._read_lines(dir+"/test_results.tsv")
    if len(datalist) != len(resultlist):
      # Raised explicitly (same exception type as before) so the check is not
      # stripped when Python runs with -O.
      raise AssertionError(
        "row count mismatch: %d gold rows vs %d result rows"
        % (len(datalist), len(resultlist)))
    with codecs.open(dir+"/output.txt","w","utf-8") as fw:
      for data, result in zip(datalist, resultlist):
        fields = data.split("\t")
        fw.write(fields[0]+"\t"+fields[2]+"\t"+result.strip()+"\n")

  def ranking_file(self):
    """Sort each group of dir/output.txt by rank score into dir/rank_result.txt.

    Every input line is written once with its rank score appended as a last
    column. Identical input lines are all kept, and lines with equal scores
    keep their input order.
    """
    print ("ranking_file")
    groups = self._load_groups("output.txt")
    with codecs.open(dir+"/rank_result.txt","w","utf-8") as fw:
      for lines in groups.values():
        # A list of pairs, not a dict keyed by the line text: a dict would
        # silently merge identical lines and shorten the ranking.
        scored = [(line, self._rank_score(line)) for line in lines]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        for line, score in scored:
          fw.write(line+"\t"+str(score)+"\n")

  def p_k(self):
    """Print P@1, P@5 and P@10 averaged over the groups of rank_result.txt."""
    print ("2.P@k")
    groups = self._load_groups("rank_result.txt")
    cutoffs = (1, 5, 10)
    mostrel = dict.fromkeys(cutoffs, 0.0)
    rel = dict.fromkeys(cutoffs, 0.0)
    for lines in groups.values():
      labels = self._labels(lines[:10])
      for k in cutoffs:
        top = labels[:k]
        hits_mostrel = top.count("2")
        hits_rel = hits_mostrel+top.count("1")
        # The divisor is always k, even for groups shorter than k lines.
        mostrel[k] += hits_mostrel/float(k)
        rel[k] += hits_rel/float(k)
    N = len(groups)
    P_at_1_mostrel = self._mean(mostrel[1], N)
    P_at_5_mostrel = self._mean(mostrel[5], N)
    P_at_10_mostrel = self._mean(mostrel[10], N)
    P_at_1_rel = self._mean(rel[1], N)
    P_at_5_rel = self._mean(rel[5], N)
    P_at_10_rel = self._mean(rel[10], N)
    print ("\tMost_relevant P@1",P_at_1_mostrel,"P@5",P_at_5_mostrel,"P@10",P_at_10_mostrel)
    print ("\tRrelevant P@1",P_at_1_rel,"P@5",P_at_5_rel,"P@10",P_at_10_rel)

  def ndcg_k(self):
    """Print NDCG@1, NDCG@5 and NDCG@10 averaged over the groups.

    Groups with fewer than 5 (or 10) lines are scored on the lines they have.
    """
    print ("4.NDCG@k")
    groups = self._load_groups("rank_result.txt")
    NDCG_1 = 0.0
    NDCG_5 = 0.0
    NDCG_10 = 0.0
    for lines in groups.values():
      gains = [int(label) for label in self._labels(lines[:10])]
      # NDCG@1 is normalised by the gain of the best possible label (2),
      # whether or not the group contains such a line.
      NDCG_1 += self._dcg(gains[:1])/self._dcg([2])
      # NOTE(review): for @5 and @10 the ideal ordering is built from the
      # retrieved top-k lines only, not from every line of the group.
      # Textbook NDCG sorts the whole group; confirm this is intended.
      NDCG_5 += self._ndcg(gains[:5])
      NDCG_10 += self._ndcg(gains)
    N = len(groups)
    _NDCG_1 = self._mean(NDCG_1, N)
    _NDCG_5 = self._mean(NDCG_5, N)
    _NDCG_10 = self._mean(NDCG_10, N)
    print ("\tNDCG@1",_NDCG_1,"NDCG@5",_NDCG_5,"NDCG@10",_NDCG_10)

  def mrr(self):
    """Print the mean reciprocal rank for most-relevant and relevant labels."""
    print ("1.MRR")
    groups = self._load_groups("rank_result.txt")
    RR_mostrelevant = 0.0
    RR_relevant = 0.0
    for lines in groups.values():
      labels = self._labels(lines)
      RR_mostrelevant += self._reciprocal_rank(labels, ("2",))
      RR_relevant += self._reciprocal_rank(labels, ("2", "1"))
    N = len(groups)
    MRR_mostrelevant = self._mean(RR_mostrelevant, N)
    MRR_relevant = self._mean(RR_relevant, N)
    print ("\tMRR_mostrelevant",MRR_mostrelevant,"MRR_relevant",MRR_relevant,"N", N)

  def map(self):
    """Print the mean over groups of the per-group averaged precision.

    NOTE(review): precision is accumulated at every rank and divided by the
    group size. Textbook average precision sums precision only at relevant
    ranks and divides by the number of relevant lines. The formula is left
    unchanged so reported numbers stay comparable; confirm which is intended.
    """
    print ("3.MAP")
    groups = self._load_groups("rank_result.txt")
    AP_mostrel = 0.0
    AP_rel = 0.0
    for lines in groups.values():
      R = len(lines)
      cnt_mostrel = 0.0
      cnt_rel = 0.0
      P_mostrel = 0.0
      P_rel = 0.0
      for i, label in enumerate(self._labels(lines)):
        if label == "2":
          cnt_mostrel += 1
          cnt_rel += 1
        elif label == "1":
          cnt_rel += 1
        P_mostrel += cnt_mostrel/(i+1.0)
        P_rel += cnt_rel/(i+1.0)
      # R is never 0: a group only exists once a line has been added to it.
      AP_mostrel += P_mostrel/R
      AP_rel += P_rel/R
    N = len(groups)
    MAP_mostrel = self._mean(AP_mostrel, N)
    MAP_rel = self._mean(AP_rel, N)
    print ("\tMAP_mostrelevant",MAP_mostrel,"MAP_relevant",MAP_rel)
if __name__=="__main__":
  # Build the intermediate files first, then print the metrics in the order
  # of their numbered headings (1.MRR, 2.P@k, 3.MAP, 4.NDCG@k).
  obj = ranking()
  for prepare in (obj.output, obj.ranking_file):
    prepare()
  for report in (obj.mrr, obj.p_k, obj.map, obj.ndcg_k):
    report()