# -*- coding: utf-8 -*-
"""
Created on Fri Apr 16 14:54:31 2021
@author: kane
"""
# Start the bert-serving server first, e.g.:
# /home/Anaconda3/bin/bert-serving-start -model_dir /home/NLP_bert/chinese_roberta_wwm_ext_L-12_H-768_A-12 -max_seq_len=120 -max_batch_size=30
import argparse
import json

import pandas as pd
from bert_serving.client import BertClient

# Client for the running bert-serving instance (see the start command above).
server_ip = "192.168.20.68"
bc = BertClient(ip=server_ip, output_fmt='list')

def create_document(doc, emb, index_name):
    """Build one document in Elasticsearch bulk-action format."""
    return {
        '_op_type': 'index',
        '_index': index_name,
        'phenomenon': doc["phenomenon"],
        'phenomenon_vector': emb
    }
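
# These documents assume an index whose phenomenon_vector field is a
# dense_vector with the same dimensionality as the encoder output (768 for
# chinese_roberta_wwm_ext_L-12_H-768_A-12). A minimal sketch of such a
# mapping, kept here only for reference; the field names come from
# create_document above, the rest is an assumption about your index setup:
EXAMPLE_INDEX_MAPPING = {
    "mappings": {
        "properties": {
            "phenomenon": {"type": "text"},
            "phenomenon_vector": {"type": "dense_vector", "dims": 768},
        }
    }
}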

def load_dataset(path):
    """Read the CSV and collect the phenomenon column as documents."""
    docs = []
    df = pd.read_csv(path)
    for _, series in df.iterrows():
        docs.append({'phenomenon': series.phenomenon})
    return docs
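
# load_dataset expects a CSV with a `phenomenon` column. A minimal sketch of
# producing such a file with pandas (the row values here are hypothetical
# placeholders, not data from the original project):
#
#   pd.DataFrame({'phenomenon': ['<fault description 1>',
#                                '<fault description 2>']}).to_csv('phenomenon.csv', index=False)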

def bulk_predict(docs, batch_size=256):
    """Encode the phenomenon text of each document into BERT embeddings, in batches."""
    for i in range(0, len(docs), batch_size):
        batch_docs = docs[i: i + batch_size]
        embeddings = bc.encode([doc['phenomenon'] for doc in batch_docs])
        for emb in embeddings:
            yield emb

def main(args):
    docs = load_dataset(args.data)
    # Write the BERT-encoded results to the output file, one JSON document per line.
    with open(args.save, 'w') as f:
        for doc, emb in zip(docs, bulk_predict(docs)):
            d = create_document(doc, emb, args.index_name)
            f.write(json.dumps(d) + '\n')

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Creating Elasticsearch documents.')
    parser.add_argument('--data', default='phenomenon.csv', help='CSV file with a phenomenon column.')
    parser.add_argument('--save', default='documents.jsonl', help='output JSON Lines file of created documents.')
    parser.add_argument('--index_name', default='fault_meter', help='Elasticsearch index name.')
    args = parser.parse_args()
    main(args)
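
# The file written by main() is already in Elasticsearch bulk-helper format
# (each line carries _op_type and _index), so it can be replayed with
# elasticsearch.helpers.bulk. A minimal sketch, assuming the elasticsearch
# Python client and a node on localhost:9200; adjust the host, and create the
# index (see EXAMPLE_INDEX_MAPPING above) before loading:
#
#   from elasticsearch import Elasticsearch
#   from elasticsearch.helpers import bulk
#
#   es = Elasticsearch('localhost:9200')
#   with open('documents.jsonl') as f:
#       actions = [json.loads(line) for line in f]
#   bulk(es, actions)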