Sfoglia il codice sorgente

故障现象相似度查询

xuYongJian 2 anni fa
commit
8301bc0d03
10 ha cambiato i file con 315 aggiunte e 0 eliminazioni
  1. 5 0
      bert/search.py
  2. 63 0
      es/create_documents.py
  3. 90 0
      es/create_index.py
  4. 0 0
      es/documents.jsonl
  5. 56 0
      es/find.py
  6. 16 0
      es/index.json
  7. 29 0
      es/index_documents.py
  8. 10 0
      es/phenomenon.csv
  9. 3 0
      es/requirements.txt
  10. 43 0
      es/run.py

+ 5 - 0
bert/search.py

@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# @File  : search.py
+# @Author: kane
+# @Date  : 2021/7/15
+# @Desc  :bert自带的相似度判断

+ 63 - 0
es/create_documents.py

@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Apr 16 14:54:31 2021
+
+@author: kane
+"""
+# /home/Anaconda3/bin/bert-serving-start -model_dir /home/NLP_bert/chinese_roberta_wwm_ext_L-12_H-768_A-12   -max_seq_len=120  -max_batch_size=30
+import argparse
+import json
+
+import pandas as pd
+from bert_serving.client import BertClient
+
+server_ip = "192.168.20.68"
+bc = BertClient(ip=server_ip, output_fmt='list')
+
+
+def create_document(doc, emb, index_name):
+    return {
+        '_op_type': 'index',
+        '_index': index_name,
+        'phenomenon': doc["phenomenon"],
+        'phenomenon_vector': emb
+    }
+
+
+def load_dataset(path):
+    docs = []
+    df = pd.read_csv(path)
+    for row in df.iterrows():
+        series = row[1]
+        doc = {
+            'phenomenon': series.phenomenon,
+        }
+        docs.append(doc)
+    return docs
+
+
+def bulk_predict(docs, batch_size=256):
+    """Predict es embeddings."""
+    for i in range(0, len(docs), batch_size):
+        batch_docs = docs[i: i + batch_size]
+        breakdown_show_embeddings = bc.encode([doc['phenomenon'] for doc in batch_docs])
+        for emb in breakdown_show_embeddings:
+            yield emb
+
+
+def main(args):
+    docs = load_dataset(args.data)
+    with open(args.save, 'w') as f:
+        for doc, emb in zip(docs, bulk_predict(docs)):
+            d = create_document(doc, emb, args.index_name)
+            f.write(json.dumps(d) + '\n')
+
+
+# bert转换的结果写到文件中
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Creating elasticsearch documents.')
+    parser.add_argument('--data', default='phenomenon.csv', help='data for creating documents.')
+    parser.add_argument('--save', default='documents.jsonl', help='created documents.')
+    parser.add_argument('--index_name', default='fault_meter', help='Elasticsearch index name.')
+    args = parser.parse_args()
+    main(args)

+ 90 - 0
es/create_index.py

@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Apr 16 14:54:31 2021
+
+@author: kane
+"""
+import argparse
+
+from elasticsearch import Elasticsearch
+
+# http://192.168.20.72:9200/
+# 链接es服务
+
+
+host = '192.168.20.69:30920'
+es = Elasticsearch([host])
+
+
+def create(index, body=None):
+    """
+    创建索引
+    :param body:
+    :param index: 索引名称
+    :return: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'student1'}
+    """
+    if es.indices.exists(index=index):
+        es.indices.delete(index=index)  # 删除索引
+    res = es.indices.create(index=index, body=body)
+    return res
+
+
+def delete(index):
+    """
+    删除索引
+    :param index: 索引名称
+    :return: True 或 False
+    """
+    if not es.indices.exists(index):
+        return False
+    else:
+        res = es.indices.delete(index=index)
+        return res['acknowledged']
+
+
+def add(index, body, id=None):
+    """
+    (单条数据添加或更新)添加或更新文档记录,更新文档时传对应的id即可
+    使用方法:
+    `
+    body = {"name": "long", "age": 11,"height": 111}
+    add(index=index_name,body=body)
+    或
+    body = {"name": "long", "age": 11,"height": 111}
+    add(index=index_name,body=body,id=1)
+    `
+    :param index: 索引名称
+    :param body:文档内容
+    :param id: 是否指定id,如不指定就会使用生成的字符串
+    :return:{'_index': 'student1', '_type': '_doc', '_id': 'nuwKDXIBujABphC4rbcq', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
+    """
+    res = es.index(index=index, body=body, id=id)
+    return res['_id']  # 返回 id
+
+
+def search(index=None):
+    """
+    查询记录:如果没有索引名称的话默认就会查询全部的索引信息
+    :param index:查询的索引名称
+    :return:
+    """
+    if not index:
+        return es.search()
+    else:
+        return es.search(index=index)
+
+
+def main(args):
+    with open(args.index_file) as index_file:
+        source = index_file.read().strip()
+        print(source)
+        create(index=args.index_name, body=source)
+
+
+if __name__ == '__main__':
+    # parser = argparse.ArgumentParser(description='Creating elasticsearch index.')
+    # parser.add_argument('--index_file', default='index.json', help='Elasticsearch index file.')
+    # parser.add_argument('--index_name', default='fault_meter', help='Elasticsearch index name.')
+    # args = parser.parse_args()
+    # main(args)
+    delete('fault_meter')

File diff suppressed because it is too large
+ 0 - 0
es/documents.jsonl


+ 56 - 0
es/find.py

@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+"""
+es余弦相似度搜索
+"""
+from bert_serving.client import BertClient
+
+server_ip = "192.168.20.68"
+bc = BertClient(server_ip)
+
+from elasticsearch import Elasticsearch
+
+host = '192.168.20.69:30920'
+es = Elasticsearch([host])
+
+
+def findRelevantHits(inQuiry):
+    global response
+    inQuiry_vector = bc.encode([inQuiry])[0].tolist()
+    queries = {
+        'es': {
+            "script_score": {
+                "query": {
+                    "match_all": {}
+                },
+                "script": {
+                    "source": "cosineSimilarity(params.inQuiry_vector, doc['phenomenon_vector'])",
+                    "params": {
+                        "inQuiry_vector": inQuiry_vector
+                    }
+                }
+            }
+        },
+        'mlt': {
+            "more_like_this": {
+                "fields": ["phenomenon"],
+                "like": inQuiry,
+                "min_term_freq": 1,
+                "max_query_terms": 50,
+                "min_doc_freq": 1
+            }
+        }
+    }
+
+    result = {'es': [], 'mlt': []}
+
+    for metric, query in queries.items():
+        if metric == 'es':
+            body = {"min_score": 0.9, "query": query, "size": 10, "_source": {"includes": ["phenomenon"]}}
+            response = es.search(index='fault_meter', body=body)
+        result[metric] = [a['_source']['phenomenon'] for a in response['hits']['hits']]
+    return result
+
+
+inQuery = "后备式UPS当负载接近满载时,"
+result = findRelevantHits(inQuery.strip().lower())
+print(result)

+ 16 - 0
es/index.json

@@ -0,0 +1,16 @@
+{
+  "settings": {
+    "number_of_shards": 5,
+    "number_of_replicas": 1
+  },
+  "mappings": {
+    "properties": {
+      "phenomenon": {
+        "type": "text"
+      },
+      "phenomenon_vector": {
+        "type": "dense_vector",
+        "dims": 768
+      }
+    }
+  }
+}

+ 29 - 0
es/index_documents.py

@@ -0,0 +1,29 @@
+"""
+Example script to index elasticsearch documents.
+"""
+import argparse
+import json
+
+from elasticsearch import Elasticsearch
+from elasticsearch.helpers import bulk
+
+
+def load_dataset(path):
+    with open(path) as f:
+        return [json.loads(line) for line in f]
+
+
+def main(args):
+    host = '192.168.20.69:30920'
+    es = Elasticsearch([host])
+    # 往es索引写数据,docs是一个json集合数据集
+    docs = load_dataset(args.data)
+    bulk(es, docs, index=args.index_name, chunk_size=1000, request_timeout=120)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Indexing elasticsearch documents.')
+    parser.add_argument('--data', default='documents.jsonl', help='Elasticsearch documents.')
+    parser.add_argument('--index_name', default='fault_meter', help='Elasticsearch index name.')
+    args = parser.parse_args()
+    main(args)

+ 10 - 0
es/phenomenon.csv

@@ -0,0 +1,10 @@
+phenomenon
+有市电时 UPS输出正常,而无市电时蜂鸣器长鸣,无输出
+蓄电池电压偏低,但开机充电十多小时,蓄电池电压仍充不上去
+"UPS开机后,面板上无任何显示,UPS不工作"
+在接入市电的情况下,每次打开 UPS,便听到继电器反复的动作声, UPS面板电池电压过低指示灯长亮且蜂鸣器长鸣
+一台后备 UPS有市电时工作正常, 无市电时逆变器有输出, 但输出电压偏低,同时变压器发出较大的噪音
+在市电供电正常时开启 UPS,逆变器工作指示灯闪烁, 蜂鸣器发出间断叫声,UPS只能工作在逆变状态,不能转换到市电工作状态
+后备式 UPS当负载接近满载时,市电供电正常,而蓄电池供电时蓄电池保险丝熔断
+UPS只能由市电供电而不能转为逆变供电
+空载通电,不能转市电,也无逆变,机器无任何运作,蜂鸣器也无响声

+ 3 - 0
es/requirements.txt

@@ -0,0 +1,3 @@
+bert-serving-client==1.9.8
+elasticsearch==7.0.4
+pandas==0.25.1

+ 43 - 0
es/run.py

@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Apr 21 09:17:19 2021
+创建索引。处理文本获取文本向量,文本添加至索引
+@author: kane
+"""
+import argparse
+import create_documents
+import index_documents
+import create_index
+
+
+def main():
+    try:
+        parser = argparse.ArgumentParser(description='Creating elasticsearch index.')
+        parser.add_argument('--index_file', default='index.json', help='Elasticsearch index file.')
+        parser.add_argument('--index_name', default='fault_meter', help='Elasticsearch index name.')
+        args = parser.parse_args()
+        create_index.main(args)
+    except:
+        print('索引创建失败')
+    try:
+        parser = argparse.ArgumentParser(description='Creating elasticsearch documents.')
+        parser.add_argument('--data', default='phenomenon.csv', help='data for creating documents.')
+        parser.add_argument('--save', default='documents.jsonl', help='created documents.')
+        parser.add_argument('--index_name', default='fault_meter', help='Elasticsearch index name.')
+        args = parser.parse_args()
+        create_documents.main(args)
+    except:
+        print('文本处理失败')
+    try:
+        parser = argparse.ArgumentParser(description='Indexing elasticsearch documents.')
+        parser.add_argument('--data', default='documents.jsonl', help='Elasticsearch documents.')
+        parser.add_argument('--index_name', default='fault_meter', help='Elasticsearch index name.')
+        args = parser.parse_args()
+        index_documents.main(args)
+    except:
+        print('文本添加至索引失败')
+    print('完成任务')
+
+
+if __name__ == '__main__':
+    main()

Some files were not shown because too many files changed in this diff