|
# coding=utf-8
import configparser
import importlib
import json
from datetime import datetime

import numpy as np

from tools.logger import Logger
-
-
# Cosine-similarity cut-offs used by Knowledge.emsearch()/recommend():
#   LTHRESHOLD - candidates scoring below this are dropped by emsearch().
#   MTHRESHOLD - above this, recommend() marks the best match with
#                'highlight' but still treats the answer as not accurate.
#   HTHRESHOLD - above this, recommend() returns immediately with
#                accurate=1.
LTHRESHOLD = 0.4
MTHRESHOLD = 0.6
HTHRESHOLD = 0.8

# Passed to the source plugin constructor as its 'db' and 'index' keyword
# arguments (see Knowledge.__init__).
DB = 'xy-cloud1'
INDEX = 'dm_q_and_a,dm_questions'

# Module-wide memo of entry name -> embedding vector, filled by
# Knowledge.emsearch(). Nothing in this file ever evicts entries.
cache = {}
-
-
class Knowledge():
    """Recommend stored entries for a question.

    Candidates come from a pluggable data source, are ranked by cosine
    similarity of their embeddings against the question, and, when there
    are more than four of them and the ``usellm`` option is not ``"0"``,
    are narrowed down by asking an LLM.

    ``self.question`` and ``self.tenant_id`` are read by the logging and
    search methods but are never assigned inside this class; they must be
    set on the instance before ``emsearch()`` or ``recommend()`` is called.
    """

    # Upper bound on the number of embedding matches emsearch() keeps.
    _MAX_CANDIDATES = 6
    # Number of entries recommend() returns at most.
    _MAX_RESULTS = 4
    # Score given to candidates that must never be selected.
    _EXCLUDED_SCORE = -2

    def __init__(self, model):
        """Read ``settings.ini`` and instantiate source, embedding and LLM.

        Args:
            model: LLM name prefix; ``model + 'llm'`` is used both as the
                config section name and as the module name under ``llms``.
        """
        self.model = model + 'llm'
        self.config = configparser.ConfigParser()
        self.config.read("settings.ini", encoding="utf-8")

        self.sourcename = self.config.get('config', 'source') + 'source'
        self.sources, self.source = self._load_plugin(
            'sources', self.sourcename, index=INDEX, db=DB)

        self.logger = Logger(self.config.get("config", "logger_path"))

        self.emname = self.config.get('config', 'embedding') + 'embedding'
        self.ems, self.embedding = self._load_plugin(
            'embeddings', self.emname)

        self.llmname = self.model
        self.llms, self.llm = self._load_plugin('llms', self.llmname)

    def _load_plugin(self, package, name, **overrides):
        """Import ``package.name`` and instantiate its ``Name`` class.

        The constructor receives the options of config section ``name`` as
        keyword arguments, with ``overrides`` taking precedence.

        Returns:
            A ``(module, instance)`` tuple.
        """
        settings = dict(self.config.items(name))
        settings.update(overrides)
        module = importlib.import_module('%s.%s' % (package, name))
        # getattr instead of eval(): the class name comes from the config
        # file and must only ever be used as an attribute lookup.
        plugin_class = getattr(module, name.capitalize())
        return module, plugin_class(**settings)

    def combine(self, result, question, accurate, tenantId):
        """Build the response dictionary handed back to the caller.

        Args:
            result: Recommended entries; the ``'result'`` key is omitted
                when this is the empty string.
            question: The question text.
            accurate: Accuracy flag stored under ``'accurate'``.
            tenantId: Tenant identifier stored under ``'tenantId'``.

        Returns:
            ``{'code': 200, 'target': 1, 'data': {...}}``.
        """
        data = {}
        if result != '':
            data['result'] = result
        data['question'] = question
        data['accurate'] = accurate
        data['llm'] = self.config.get('config', 'model')
        data['tenantId'] = tenantId
        return {'code': 200, 'target': 1, 'data': data}

    def log_header(self, method):
        """Return the common prefix of a log record, ending in ``"answers":[``.

        Args:
            method: Already-quoted method label, e.g. ``'"emlist"'``.
        """
        stamp = datetime.now().strftime('%Y-%m-%dT%H:%M:%S:%f')
        # NOTE(review): the "tentant_id" spelling and the doubled comma in
        # front of "method" look accidental, but the emitted text is kept
        # byte-for-byte because log consumers may depend on it.
        return ''.join([
            '"timestamp":"', stamp, '",',
            '"tentant_id":', str(self.tenant_id), ',',
            '"embedding":"', self.emname, '",',
            '"model":"', self.llmname, '",',
            '"source":"', self.sourcename, '",',
            ',"method":', method, ',',
            '"question":"', self.question, '","answers":[',
        ])

    @staticmethod
    def _format_items(items, scores=None):
        """Render entries as ``["name",id]`` (optionally ``,score``) joined by commas.

        Ids are converted with ``str()`` so non-string ids do not raise.
        """
        entries = []
        for position, item in enumerate(items):
            entry = '"' + item['name'] + '",' + str(item['id'])
            if scores is not None:
                entry += ',%.3f' % scores[position]
            entries.append('[' + entry + ']')
        return ','.join(entries)

    def _log_items(self, method, items, scores=None):
        """Write one info record listing ``items`` under ``method``."""
        self.logger.info(self.log_header(method)
                         + self._format_items(items, scores) + ']')

    def _score_candidates(self, candidates):
        """Return the cosine similarity of every candidate name to the question.

        Embeddings of candidate names are memoised in the module-level
        ``cache``. Candidates whose similarity is undefined (zero-length
        or non-finite vectors) get ``_EXCLUDED_SCORE`` so they can never
        be picked; without this a NaN score would win ``np.argmax`` and
        slip past the threshold comparison.
        """
        scores = np.full(len(candidates), float(self._EXCLUDED_SCORE))
        question_vec = self.embedding.getem(self.question)
        question_norm = np.linalg.norm(question_vec)
        for position, item in enumerate(candidates):
            name = item['name']
            if name not in cache:
                cache[name] = self.embedding.getem(name)
            vector = cache[name]
            denominator = question_norm * np.linalg.norm(vector)
            if denominator > 0:
                similarity = np.dot(question_vec, vector) / denominator
                if np.isfinite(similarity):
                    scores[position] = similarity
        return scores

    def emsearch(self):
        """Rank the tenant's entries by embedding similarity to the question.

        Returns:
            ``[em_list, score_em_max]``: up to six entries scoring at least
            ``LTHRESHOLD``, best first, and the best score (``-2`` when
            nothing qualified).
        """
        data_all = self.source.getdata(tenant_id=self.tenant_id)
        self._log_items('"emlist"', data_all)

        em_list = []
        score_em_max = self._EXCLUDED_SCORE
        if len(data_all) > 0:
            scores = self._score_candidates(data_all)
            picked_scores = []
            for rank in range(self._MAX_CANDIDATES):
                best = np.argmax(scores)
                if scores[best] < LTHRESHOLD:
                    break
                if rank == 0:
                    score_em_max = scores[best]
                em_list.append(data_all[best])
                picked_scores.append(scores[best])
                # Take the winner out of the running for the next round.
                scores[best] = self._EXCLUDED_SCORE
            self._log_items('"emsearch"', em_list, picked_scores)
        return [em_list, score_em_max]

    @staticmethod
    def _pad_results(chosen, candidates, size):
        """Top ``chosen`` up to ``size`` entries from ``candidates``, in order.

        Entries already present (compared by equality) are skipped. One
        bounded pass is used so the method terminates even when
        ``candidates`` holds fewer than ``size`` distinct entries.
        """
        padded = list(chosen)
        for item in candidates:
            if len(padded) >= size:
                break
            if item not in padded:
                padded.append(item)
        return padded[0:size]

    def _llm_select(self, candidates):
        """Ask the LLM to pick the closest candidates and map names back to entries.

        Falls back to the first ``_MAX_RESULTS`` candidates, after logging
        an error, when the reply cannot be parsed as a JSON object.
        """
        names = ','.join('"' + item['name'] + '"' for item in candidates)
        Q1 = "请从列表{"
        Q2 = "}选出与'"
        Q3 = "'意图最接近的四句话,将结果以{question1:,question2:,question3:,question4:}输出。"
        Q4 = '输出为JSON格式。答案只能来自于列表。不要返回代码。不要输出JSON之外的东西'
        prompt = Q1 + names + Q2 + self.question + Q3 + Q4
        self.logger.info(self.log_header('"llmin"') + '"' + prompt + '"]')

        answer, tokens = self.llm.link(prompt)
        tokens = str(tokens).replace("'", '"')
        # [0:-1] drops the trailing '[' of the header for this record.
        self.logger.info(self.log_header('"llmout"')[0:-1]
                         + repr(answer) + ',"tokens":' + tokens)

        # Keep only the outermost {...} span and undo common escaping.
        payload = answer[answer.find('{'):answer.rfind('}') + 1]
        payload = payload.replace('\\n', '').replace('\\"', '"')
        payload = payload.replace("'", '"')
        try:
            data = json.loads(payload)
            if not isinstance(data, dict):
                raise ValueError('LLM reply is not a JSON object')
        except (ValueError, RuntimeError):
            # ValueError covers JSONDecodeError; RuntimeError covers the
            # recursion error raised for absurdly nested replies.
            # NOTE(review): the closing bracket sits inside the quotes in
            # this message; kept as-is for log compatibility.
            self.logger.error(self.log_header('"llmsearch"')
                              + '"Can not trans to JSON.]"')
            return candidates[0:self._MAX_RESULTS]

        chosen = []
        for value in data.values():
            for item in candidates:
                if value == item['name'] and item not in chosen:
                    chosen.append(item)
        return chosen

    def recommend(self):
        """Return the response dictionary with recommendations for the question.

        The best match is flagged with ``'highlight'`` when its score
        exceeds ``MTHRESHOLD``; above ``HTHRESHOLD`` the answer is also
        reported as accurate and returned straight away. Otherwise up to
        four entries are returned, chosen by the LLM when more than four
        candidates exist and the ``usellm`` option is not ``"0"``.
        """
        self.logger.info(self.log_header('"searchbegin"') + ']')
        result_list, score_em_max = self.emsearch()
        limit = self._MAX_RESULTS

        if score_em_max > HTHRESHOLD:
            result_list[0]['highlight'] = 1
            self._log_items('"recommend"', result_list)
            return self.combine(result_list[0:limit],
                                self.question, 1, self.tenant_id)

        if score_em_max > MTHRESHOLD:
            result_list[0]['highlight'] = 1
        self._log_items('"recommendlist"', result_list)

        if len(result_list) == 0:
            self._log_items('"recommend"', [])
            return self.combine("", self.question, 0, self.tenant_id)

        if (len(result_list) <= limit
                or self.config.get('config', 'usellm') == "0"):
            self._log_items('"recommend"', result_list)
            return self.combine(result_list[0:limit],
                                self.question, 0, self.tenant_id)

        result = self._llm_select(result_list)
        # Logged before padding so the record shows what the LLM chose.
        self._log_items('"llmsearch"', result)
        result = self._pad_results(result, result_list, limit)
        self._log_items('"recommend"', result)
        return self.combine(result, self.question, 0, self.tenant_id)
|