# Python script for exporting data from Solr
import pysolr
import json
# Settings to edit when changing a document's URL: look up the article id,
# point search_sql at it, and put the replacement URL in modify_url.

# Solr query selecting the documents to process ('*:*' matches everything).
search_sql = "*:*"

# Replacement URL for the selected document.
# NOTE(review): modify_url is not read anywhere in the visible part of this
# script - confirm whether it is still needed.
modify_url = "http://www.baidu.com/111"
def filter_export_fields(doc, skip_fields):
    """Return a copy of *doc* without the keys listed in *skip_fields*.

    Args:
        doc: Mapping of field name to value for a single document.
        skip_fields: Container of field names to leave out of the result.

    Returns:
        A new dict with every remaining key/value pair of *doc*, in the
        original key order. *doc* itself is left unmodified.
    """
    return {
        name: value
        for name, value in doc.items()
        if name not in skip_fields
    }


if __name__ == "__main__":
    # Solr address; always_commit=True is the setting for real-time commits.
    url = "http://192.168.1.185:8080/solr/kap"
    s = pysolr.Solr(url, always_commit=True)

    # Number of documents requested per query.
    one_count = 1

    # Fields that must be skipped on export (for example copy fields).
    caption_fields = [
        'KNOWLEDGE_ID',
        'author_s',
        'suggestion',
        'danweiaddr',
        'multizuozhes',
        'multizuozhedanwei',
        'multitopic',
        'multilabel',
        'multizuozhe',
    ]
    # _version_ is dropped as well; a frozenset gives O(1) membership tests.
    skip_fields = frozenset(caption_fields) | {'_version_'}

    # Ask for a single row just to read the total number of matches.
    data_num_count = int(
        s.search(search_sql, start=0, rows=1).raw_response['response']['numFound']
    )
    s_num = data_num_count
    print('数据总量:', data_num_count)

    # The with-block guarantees the output file is closed even when a Solr
    # request raises part-way through the export.
    with open('json_data.json', 'w', encoding="utf-8") as f:
        for page in range(0, data_num_count, one_count):
            result_src = s.search(
                search_sql,
                sort='feachdataid asc',
                start=page,
                rows=one_count,
            )
            # Handle each returned document individually.
            for data in result_src.docs:
                tmp_doc = filter_export_fields(data, skip_fields)
                s_num -= 1
                print("剩余数量", s_num, ":::::使用时间::::")
                # One JSON object per line; flush so progress is visible on
                # disk while the export is still running.
                f.write(json.dumps(tmp_doc, ensure_ascii=False) + "\n")
                f.flush()
    # NOTE(review): a disabled s.add(tmp_doc) call used to follow here,
    # presumably for writing documents back to Solr - confirm before reviving.