Explorar el Código

故障现象相似度查询

xuYongJian hace 2 años
commit
8301bc0d03
Se han modificado 10 ficheros con 315 adiciones y 0 borrados
  1. 5 0
      bert/search.py
  2. 63 0
      es/create_documents.py
  3. 90 0
      es/create_index.py
  4. 0 0
      es/documents.jsonl
  5. 56 0
      es/find.py
  6. 16 0
      es/index.json
  7. 29 0
      es/index_documents.py
  8. 10 0
      es/phenomenon.csv
  9. 3 0
      es/requirements.txt
  10. 43 0
      es/run.py

+ 5 - 0
bert/search.py

@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# @File  : search.py
+# @Author: kane
+# @Date  : 2021/7/15
+# @Desc  :bert自带的相似度判断

+ 63 - 0
es/create_documents.py

@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Apr 16 14:54:31 2021
+
+@author: kane
+"""
+# /home/Anaconda3/bin/bert-serving-start -model_dir /home/NLP_bert/chinese_roberta_wwm_ext_L-12_H-768_A-12   -max_seq_len=120  -max_batch_size=30
+import argparse
+import json
+
+import pandas as pd
+from bert_serving.client import BertClient
+
+# Address of the bert-serving server that computes the sentence embeddings.
+server_ip = "192.168.20.68"
+# Shared client, created at import time. output_fmt='list' requests plain
+# Python lists, which main() passes straight to json.dumps.
+bc = BertClient(ip=server_ip, output_fmt='list')
+
+
+def create_document(doc, emb, index_name):
+    return {
+        '_op_type': 'index',
+        '_index': index_name,
+        'phenomenon': doc["phenomenon"],
+        'phenomenon_vector': emb
+    }
+
+
+def load_dataset(path):
+    docs = []
+    df = pd.read_csv(path)
+    for row in df.iterrows():
+        series = row[1]
+        doc = {
+            'phenomenon': series.phenomenon,
+        }
+        docs.append(doc)
+    return docs
+
+
+def bulk_predict(docs, batch_size=256):
+    """Predict es embeddings."""
+    for i in range(0, len(docs), batch_size):
+        batch_docs = docs[i: i + batch_size]
+        breakdown_show_embeddings = bc.encode([doc['phenomenon'] for doc in batch_docs])
+        for emb in breakdown_show_embeddings:
+            yield emb
+
+
+def main(args):
+    docs = load_dataset(args.data)
+    with open(args.save, 'w') as f:
+        for doc, emb in zip(docs, bulk_predict(docs)):
+            d = create_document(doc, emb, args.index_name)
+            f.write(json.dumps(d) + '\n')
+
+
+# Write the BERT-encoded results to a file.
+if __name__ == '__main__':
+    # Every option has a default, so the script also runs without arguments.
+    parser = argparse.ArgumentParser(description='Creating elasticsearch documents.')
+    parser.add_argument('--data', default='phenomenon.csv', help='data for creating documents.')
+    parser.add_argument('--save', default='documents.jsonl', help='created documents.')
+    parser.add_argument('--index_name', default='fault_meter', help='Elasticsearch index name.')
+    args = parser.parse_args()
+    main(args)

+ 90 - 0
es/create_index.py

@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Apr 16 14:54:31 2021
+
+@author: kane
+"""
+import argparse
+
+from elasticsearch import Elasticsearch
+
+# http://192.168.20.72:9200/
+# Connect to the Elasticsearch service.
+
+
+# Module-level client shared by every helper in this file.
+host = '192.168.20.69:30920'
+es = Elasticsearch([host])
+
+
+def create(index, body=None):
+    """
+    创建索引
+    :param body:
+    :param index: 索引名称
+    :return: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'student1'}
+    """
+    if es.indices.exists(index=index):
+        es.indices.delete(index=index)  # 删除索引
+    res = es.indices.create(index=index, body=body)
+    return res
+
+
+def delete(index):
+    """
+    删除索引
+    :param index: 索引名称
+    :return: True 或 False
+    """
+    if not es.indices.exists(index):
+        return False
+    else:
+        res = es.indices.delete(index=index)
+        return res['acknowledged']
+
+
+def add(index, body, id=None):
+    """
+    (单条数据添加或更新)添加或更新文档记录,更新文档时传对应的id即可
+    使用方法:
+    `
+    body = {"name": "long", "age": 11,"height": 111}
+    add(index=index_name,body=body)
+    或
+    body = {"name": "long", "age": 11,"height": 111}
+    add(index=index_name,body=body,id=1)
+    `
+    :param index: 索引名称
+    :param body:文档内容
+    :param id: 是否指定id,如不指定就会使用生成的字符串
+    :return:{'_index': 'student1', '_type': '_doc', '_id': 'nuwKDXIBujABphC4rbcq', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
+    """
+    res = es.index(index=index, body=body, id=id)
+    return res['_id']  # 返回 id
+
+
+def search(index=None):
+    """
+    查询记录:如果没有索引名称的话默认就会查询全部的索引信息
+    :param index:查询的索引名称
+    :return:
+    """
+    if not index:
+        return es.search()
+    else:
+        return es.search(index=index)
+
+
+def main(args):
+    with open(args.index_file) as index_file:
+        source = index_file.read().strip()
+        print(source)
+        create(index=args.index_name, body=source)
+
+
+if __name__ == '__main__':
+    # NOTE(review): the argparse-driven index creation is commented out, so
+    # running this file directly only deletes the 'fault_meter' index.
+    # parser = argparse.ArgumentParser(description='Creating elasticsearch index.')
+    # parser.add_argument('--index_file', default='index.json', help='Elasticsearch index file.')
+    # parser.add_argument('--index_name', default='fault_meter', help='Elasticsearch index name.')
+    # args = parser.parse_args()
+    # main(args)
+    delete('fault_meter')

La diferencia del archivo ha sido suprimida porque es demasiado grande
+ 0 - 0
es/documents.jsonl


+ 56 - 0
es/find.py

@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+"""
+es余弦相似度搜索
+"""
+from bert_serving.client import BertClient
+
+# bert-serving server used to embed the query text; the client is created at import time.
+server_ip = "192.168.20.68"
+bc = BertClient(server_ip)
+
+from elasticsearch import Elasticsearch
+
+# Elasticsearch client used by findRelevantHits.
+host = '192.168.20.69:30920'
+es = Elasticsearch([host])
+
+
+def findRelevantHits(inQuiry):
+    global response
+    inQuiry_vector = bc.encode([inQuiry])[0].tolist()
+    queries = {
+        'es': {
+            "script_score": {
+                "query": {
+                    "match_all": {}
+                },
+                "script": {
+                    "source": "cosineSimilarity(params.inQuiry_vector, doc['phenomenon_vector'])",
+                    "params": {
+                        "inQuiry_vector": inQuiry_vector
+                    }
+                }
+            }
+        },
+        'mlt': {
+            "more_like_this": {
+                "fields": ["phenomenon"],
+                "like": inQuiry,
+                "min_term_freq": 1,
+                "max_query_terms": 50,
+                "min_doc_freq": 1
+            }
+        }
+    }
+
+    result = {'es': [], 'mlt': []}
+
+    for metric, query in queries.items():
+        if metric == 'es':
+            body = {"min_score": 0.9, "query": query, "size": 10, "_source": {"includes": ["phenomenon"]}}
+            response = es.search(index='fault_meter', body=body)
+        result[metric] = [a['_source']['phenomenon'] for a in response['hits']['hits']]
+    return result
+
+
+# Demo query; these statements run whenever the module is imported or executed.
+# NOTE(review): .lower() turns "UPS" into "ups" before the text is encoded and
+# matched; confirm that this is intended.
+inQuery = "后备式UPS当负载接近满载时,"
+result = findRelevantHits(inQuery.strip().lower())
+print(result)

+ 16 - 0
es/index.json

@@ -0,0 +1,16 @@
+{
+  "settings": {
+    "number_of_shards": 5,
+    "number_of_replicas": 1
+  },
+  "mappings": {"properties": {
+	  "phenomenon":{
+          "type": "text"
+	},
+       "phenomenon_vector": {
+        "type": "dense_vector",
+        "dims": 768
+      }
+    }
+  }
+}

+ 29 - 0
es/index_documents.py

@@ -0,0 +1,29 @@
+"""
+Example script to index elasticsearch documents.
+"""
+import argparse
+import json
+
+from elasticsearch import Elasticsearch
+from elasticsearch.helpers import bulk
+
+
+def load_dataset(path):
+    with open(path) as f:
+        return [json.loads(line) for line in f]
+
+
+def main(args):
+    host = '192.168.20.69:30920'
+    es = Elasticsearch([host])
+    # 往es索引写数据,docs是一个json集合数据集
+    docs = load_dataset(args.data)
+    bulk(es, docs, index=args.index_name, chunk_size=1000, request_timeout=120)
+
+
+if __name__ == '__main__':
+    # Every option has a default, so the script also runs without arguments.
+    parser = argparse.ArgumentParser(description='Indexing elasticsearch documents.')
+    parser.add_argument('--data', default='documents.jsonl', help='Elasticsearch documents.')
+    parser.add_argument('--index_name', default='fault_meter', help='Elasticsearch index name.')
+    args = parser.parse_args()
+    main(args)

+ 10 - 0
es/phenomenon.csv

@@ -0,0 +1,10 @@
+phenomenon
+有市电时 UPS输出正常,而无市电时蜂鸣器长鸣,无输出
+蓄电池电压偏低,但开机充电十多小时,蓄电池电压仍充不上去
+"UPS开机后,面板上无任何显示,UPS不工作"
+在接入市电的情况下,每次打开 UPS,便听到继电器反复的动作声, UPS面板电池电压过低指示灯长亮且蜂鸣器长鸣
+一台后备 UPS有市电时工作正常, 无市电时逆变器有输出, 但输出电压偏低,同时变压器发出较大的噪音
+在市电供电正常时开启 UPS,逆变器工作指示灯闪烁, 蜂鸣器发出间断叫声,UPS只能工作在逆变状态,不能转换到市电工作状态
+后备式 UPS当负载接近满载时,市电供电正常,而蓄电池供电时蓄电池保险丝熔断
+UPS只能由市电供电而不能转为逆变供电
+空载通电,不能转市电,也无逆变,机器无任何运作,蜂鸣器也无响声

+ 3 - 0
es/requirements.txt

@@ -0,0 +1,3 @@
+bert-serving-client==1.9.8
+elasticsearch==7.0.4
+pandas==0.25.1

+ 43 - 0
es/run.py

@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Apr 21 09:17:19 2021
+创建索引。处理文本获取文本向量,文本添加至索引
+@author: kane
+"""
+import argparse
+import create_documents
+import index_documents
+import create_index
+
+
+def main():
+    try:
+        parser = argparse.ArgumentParser(description='Creating elasticsearch index.')
+        parser.add_argument('--index_file', default='index.json', help='Elasticsearch index file.')
+        parser.add_argument('--index_name', default='fault_meter', help='Elasticsearch index name.')
+        args = parser.parse_args()
+        create_index.main(args)
+    except:
+        print('索引创建失败')
+    try:
+        parser = argparse.ArgumentParser(description='Creating elasticsearch documents.')
+        parser.add_argument('--data', default='phenomenon.csv', help='data for creating documents.')
+        parser.add_argument('--save', default='documents.jsonl', help='created documents.')
+        parser.add_argument('--index_name', default='fault_meter', help='Elasticsearch index name.')
+        args = parser.parse_args()
+        create_documents.main(args)
+    except:
+        print('文本处理失败')
+    try:
+        parser = argparse.ArgumentParser(description='Indexing elasticsearch documents.')
+        parser.add_argument('--data', default='documents.jsonl', help='Elasticsearch documents.')
+        parser.add_argument('--index_name', default='fault_meter', help='Elasticsearch index name.')
+        args = parser.parse_args()
+        index_documents.main(args)
+    except:
+        print('文本添加至索引失败')
+    print('完成任务')
+
+
+if __name__ == '__main__':
+    main()

Algunos archivos no se mostraron porque demasiados archivos cambiaron en este cambio