# -*- coding: utf-8 -*-
"""
Created on Fri Apr 16 14:54:31 2021

@author: kane
"""
# Command used to start the embedding server this script talks to:
# /home/Anaconda3/bin/es-serving-start -model_dir /home/NLP_bert/chinese_roberta_wwm_ext_L-12_H-768_A-12 -max_seq_len=120 -max_batch_size=30

import argparse
import json

import pandas as pd
from bert_serving.client import BertClient

# Address of the embedding server; the client is created once at import time
# and shared by bulk_predict().
server_ip = "192.168.20.68"
bc = BertClient(ip=server_ip, output_fmt='list')


def create_document(doc, emb, index_name):
    """Build the record written for a single phenomenon.

    Args:
        doc: Mapping holding the text under the key ``'phenomenon'``.
        emb: Embedding produced for that text.
        index_name: Name stored under ``'_index'``.

    Returns:
        A dict with the keys ``'_op_type'`` (always ``'index'``),
        ``'_index'``, ``'phenomenon'`` and ``'phenomenon_vector'``.
        NOTE(review): this looks like an Elasticsearch bulk-helper action;
        confirm against the consumer of the output file.
    """
    document = {
        '_op_type': 'index',
        '_index': index_name,
    }
    document['phenomenon'] = doc["phenomenon"]
    document['phenomenon_vector'] = emb
    return document


def load_dataset(path):
    """Read the CSV at *path* and return one dict per row.

    Each dict has a single key, ``'phenomenon'``, taken from the row's
    ``phenomenon`` attribute; all other columns are ignored.
    """
    frame = pd.read_csv(path)
    return [
        {'phenomenon': record.phenomenon}
        for _, record in frame.iterrows()
    ]


def bulk_predict(docs, batch_size=256):
    """Yield one embedding per entry of *docs*, in order.

    The texts are sent to the module-level client ``bc`` in slices of at
    most *batch_size* documents, and the embeddings of each slice are
    yielded before the next slice is encoded.
    """
    for start in range(0, len(docs), batch_size):
        texts = [item['phenomenon'] for item in docs[start: start + batch_size]]
        yield from bc.encode(texts)


def main(args):
    """Load the dataset, embed every phenomenon and save the result.

    Writes one JSON object per line to ``args.save``, pairing each document
    loaded from ``args.data`` with its embedding.
    """
    docs = load_dataset(args.data)
    embeddings = bulk_predict(docs)
    with open(args.save, 'w') as out:
        for doc, emb in zip(docs, embeddings):
            record = create_document(doc, emb, args.index_name)
            # Write the BERT conversion result to the file, one document per line.
            line = json.dumps(record)
            out.write(line + '\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Creating elasticsearch documents.')
    parser.add_argument('--data', default='phenomenon.csv', help='data for creating documents.')
    parser.add_argument('--save', default='documents.jsonl', help='created documents.')
    parser.add_argument('--index_name', default='fault_meter', help='Elasticsearch index name.')
    args = parser.parse_args()
    main(args)