Contract-document-crawling-…/搜索获取数据.py

import requests
import json
import time
import os
import re
import urllib.parse
import datetime
import traceback
from lxml import html

# ================= 1. Configuration =================
base_url = "http://111.198.24.44:88/index.php"

# Login parameters
login_payload = {
    "module": "Users",
    "action": "Authenticate",
    "return_module": "Users",
    "return_action": "Login",
    "user_name": "TEST",      # fill in the real username here
    "user_password": "test",  # fill in the real password here
    "login_theme": "newskin"
}

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Referer": "http://111.198.24.44:88/index.php?module=SalesOrder&action=index"
}

# ================= 2. Helper functions =================
def get_current_action_id():
    """Return the current time as a 13-digit millisecond timestamp."""
    return int(time.time() * 1000)
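
# Example: get_current_action_id() -> 1768628452000
# (13 digits: milliseconds since the Unix epoch; the value shown is illustrative).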

def clean_html_tags(text):
    """Strip HTML tags, keeping only the visible text content."""
    if not text:
        return ""
    # Remove HTML tags
    clean_text = re.sub(r'<[^>]+>', ' ', text)
    # Replace HTML entities
    clean_text = clean_text.replace('&nbsp;', ' ')
    # Collapse runs of spaces and newlines
    clean_text = re.sub(r'\s+', ' ', clean_text)
    # Trim leading/trailing whitespace
    clean_text = clean_text.strip()
    return clean_text
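
# A quick illustration of clean_html_tags (the markup below is made up):
#   clean_html_tags("<td>Order&nbsp;No.&nbsp;<b>123</b></td>") -> "Order No. 123"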

def extract_html_content(html_content, xpath):
    """Extract the content at the given XPath from an HTML document."""
    try:
        # Parse the HTML
        tree = html.fromstring(html_content)
        # Try to extract the content at the given XPath
        elements = tree.xpath(xpath)
        if elements:
            # Serialize the matched element back to HTML
            element_html = html.tostring(elements[0], encoding='unicode', pretty_print=True)
            # Strip the HTML tags
            cleaned_text = clean_html_tags(element_html)
            # Keep both the raw HTML and the cleaned text
            return {
                "raw_html": element_html,
                "cleaned_text": cleaned_text
            }
        else:
            print(f" ⚠️ XPath not found: {xpath}")
            return {
                "raw_html": "",
                "cleaned_text": ""
            }
    except Exception as e:
        print(f" ❌ HTML parsing error: {e}")
        return {
            "raw_html": "",
            "cleaned_text": ""
        }

def fetch_html_detail(session, record_id, xpath):
    """Fetch a record's HTML detail page and extract the given XPath."""
    try:
        # Build the detail-page URL
        html_url = f"http://111.198.24.44:88/index.php?module=SalesOrder&action=DetailView&record={record_id}"
        # Fetch the HTML page
        html_response = session.get(html_url, headers=headers)
        if html_response.status_code == 200:
            # Extract the content at the given XPath
            extracted_content = extract_html_content(html_response.content, xpath)
            return extracted_content
        else:
            print(f" ❌ HTML page request failed: HTTP {html_response.status_code}")
            return {
                "raw_html": "",
                "cleaned_text": ""
            }
    except Exception as e:
        print(f" ❌ Failed to fetch HTML detail: {e}")
        return {
            "raw_html": "",
            "cleaned_text": ""
        }

def extract_crmid_from_search_result(html_content):
    """Extract CRM IDs from the search-result page (corrected version)."""
    crmids = []
    try:
        # Parse the HTML
        tree = html.fromstring(html_content)
        # First locate the SalesOrder module's div
        sales_order_div = tree.xpath('//div[@class="collapse in" and @id="collapse-SalesOrder"]')
        if not sales_order_div:
            print(" ⚠️ No SalesOrder search results found")
            # Save the HTML for debugging
            with open("debug_no_salesorder.html", "w", encoding="utf-8") as f:
                f.write(html.tostring(tree, encoding='unicode', pretty_print=True))
            return crmids
        print(" ✅ SalesOrder module found")

        # Method 1: walk a fixed XPath pattern. Result links live at
        # /html/body/div[1]/div/div[2]/div/div/div[5]/div/div/div/div/div[3]/div[n]/div[2]/a,
        # where the second-to-last div index n starts at 2 and increments per result.
        base_path = "/html/body/div[1]/div/div[2]/div/div/div[5]/div/div/div/div/div[3]"
        n = 2
        while True:
            xpath_pattern = f"{base_path}/div[{n}]/div[2]/a"
            elements = tree.xpath(xpath_pattern)
            if not elements:
                # Nothing at this n: stop once n passes the cap of 20,
                # otherwise skip ahead and try n + 1
                if n > 20:
                    break
                n += 1
                continue
            # Elements found: pull the record value out of each onclick attribute
            for element in elements:
                onclick_attr = element.get('onclick', '')
                if onclick_attr:
                    match = re.search(r"record=(\d+)", onclick_attr)
                    if match:
                        crmid = match.group(1)
                        if crmid not in crmids:
                            crmids.append(crmid)
                            print(f" Found CRM ID via XPath {xpath_pattern}: {crmid}")
            n += 1

        # Method 2 (fallback): find every link whose onclick mentions module=SalesOrder
        if not crmids:
            print(" Trying fallback method to find CRM IDs...")
            salesorder_links = tree.xpath('//a[contains(@onclick, "module=SalesOrder")]')
            for link in salesorder_links:
                onclick_attr = link.get('onclick', '')
                if onclick_attr:
                    match = re.search(r"record=(\d+)", onclick_attr)
                    if match:
                        crmid = match.group(1)
                        if crmid not in crmids:
                            crmids.append(crmid)
            print(f" Fallback method found {len(crmids)} CRM IDs")

        # Deduplicate while preserving order (crmids is already unique above,
        # so this is just a safety net)
        unique_crmids = list(dict.fromkeys(crmids))
        print(f" {len(unique_crmids)} unique CRM IDs after deduplication")
        return unique_crmids
    except Exception as e:
        print(f" ❌ Failed to parse search results: {e}")
        traceback.print_exc()
        return crmids
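
# For reference, the kind of onclick markup the regexes above target
# (this sample is fabricated; the real attribute may differ):
#   <a onclick="location.href='index.php?module=SalesOrder&action=DetailView&record=12345'">
# re.search(r"record=(\d+)", onclick) captures "12345".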

def perform_search(session, query_string):
    """Run a unified search and return the list of CRM IDs found."""
    try:
        # Build the search URL
        search_url = f"http://111.198.24.44:88/index.php?module=Home&action=UnifiedSearch&selectedmodule=undefined&query_string={query_string}"
        print(f" Searching for: {query_string}")
        print(f" Search URL: {search_url}")
        # Fetch the search-result page
        search_response = session.get(search_url, headers=headers)
        if search_response.status_code != 200:
            print(f" ❌ Search request failed: HTTP {search_response.status_code}")
            return []
        # Save the result page for debugging
        with open("debug_search_result.html", "w", encoding="utf-8") as f:
            f.write(search_response.text)
        print(" Search results saved to debug_search_result.html")
        # Extract the CRM IDs
        crmids = extract_crmid_from_search_result(search_response.content)
        return crmids
    except Exception as e:
        print(f" ❌ Search failed: {e}")
        return []
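
# Hypothetical usage (assumes a logged-in session; the returned ids are placeholders):
#   crmids = perform_search(session, urllib.parse.quote("合同"))
#   -> e.g. ["12345", "12346"]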

# ================= 3. Main program logic =================
def main():
    session = requests.Session()
    # The XPath to extract from each detail page
    target_xpath = "/html/body/div[1]/div/div[2]/div[2]/form/div[1]/div[1]/div[2]"
    try:
        # --- Step 1: log in ---
        print("1. Logging in...")
        login_response = session.post(base_url, data=login_payload, headers=headers)
        # Check whether we received a session cookie
        if 'PHPSESSID' not in session.cookies:
            print("⚠️ Warning: no PHPSESSID detected; login may have failed and later steps may error out.")
        else:
            print(" ✅ Login succeeded; cookie acquired.")
        # --- Step 2: read the user's search input ---
        print("\n2. Enter the search text:")
        query_string = input(" Search keyword: ").strip()
        if not query_string:
            print(" ❌ No search text entered; exiting.")
            return
        # URL-encode the query string
        encoded_query = urllib.parse.quote(query_string)
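        # e.g. urllib.parse.quote("合同A") -> "%E5%90%88%E5%90%8CA"
        # (non-ASCII characters become percent-encoded UTF-8 bytes)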
        # --- Step 3: run the search and extract CRM IDs ---
        print("\n3. Running the search and extracting CRM IDs...")
        crmids = perform_search(session, encoded_query)
        if not crmids:
            print(" ❌ No CRM IDs found; exiting.")
            return
        print(f" ✅ Extracted {len(crmids)} CRM IDs: {crmids}")
        # --- Step 4: fetch the details for each CRM ID in turn ---
        print("\n4. Fetching order details one by one...")
        success_count = 0
        orders_data = []
        for index, crmid in enumerate(crmids):
            print(f"\n [{index + 1}/{len(crmids)}] Processing CRM ID: {crmid}")
            # 1. Fetch the JSON details (product details)
            json_detail = None
            detail_payload = {
                "module": "Plugins",
                "pluginName": "DetailProductTable",
                "action": "getTableData",
                "moduleName": "SalesOrder",
                "record": crmid,
                "actionId": get_current_action_id(),
                "isTool": "1"
            }
            try:
                # Request the JSON details
                detail_resp = session.post(base_url, data=detail_payload, headers=headers)
                json_detail = detail_resp.json()
                print(" ✅ JSON details fetched")
            except Exception as e:
                print(f" ❌ Failed to fetch JSON details: {e}")
                json_detail = {"error": str(e)}
            # 2. Fetch the HTML details and extract the target XPath
            print(" Fetching HTML details...")
            html_content = fetch_html_detail(session, crmid, target_xpath)
            # 3. Assemble the order data
            order_data = {
                "crmid": crmid,
                "json_details": json_detail,
                "html_details": html_content,
                "combined_data": {
                    "crmid": crmid,
                    "json_data": json_detail,
                    "html_extracted_text": html_content.get("cleaned_text", ""),
                    "html_raw": html_content.get("raw_html", "")
                }
            }
            orders_data.append(order_data)
            success_count += 1
            print(f" ✅ CRM ID {crmid} done")
            # Polite delay to avoid hammering the server
            time.sleep(0.5)
        # --- Step 5: save the results ---
        print("\n5. Saving results...")
        # Create the output directory
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = f"crm_data_search_{timestamp}"
        os.makedirs(output_dir, exist_ok=True)
        # Save the search-query metadata
        search_info = {
            "query_string": query_string,
            "encoded_query": encoded_query,
            "search_time": time.strftime("%Y-%m-%d %H:%M:%S"),
            "crmids_found": crmids,
            "total_count": len(crmids)
        }
        with open(os.path.join(output_dir, "search_info.json"), 'w', encoding='utf-8') as f:
            json.dump(search_info, f, ensure_ascii=False, indent=4)
        # Save the full combined data
        full_filename = os.path.join(output_dir, "all_orders_combined.json")
        with open(full_filename, 'w', encoding='utf-8') as f:
            json.dump(orders_data, f, ensure_ascii=False, indent=4)
        # Also store each record in its own file, keyed by crmid
        print(" Saving one file per CRM ID...")
        for order in orders_data:
            record_id = order.get('crmid')
            if record_id:
                # Save this crmid's data on its own
                single_filename = os.path.join(output_dir, f"crm_{record_id}.json")
                with open(single_filename, 'w', encoding='utf-8') as f:
                    json.dump(order, f, ensure_ascii=False, indent=4)
        # Save the extracted text as a plain-text file for easy reading
        text_filename = os.path.join(output_dir, "extracted_texts.txt")
        with open(text_filename, 'w', encoding='utf-8') as f:
            f.write(f"=== Search query: {query_string} ===\n")
            f.write(f"=== Extracted at: {time.strftime('%Y-%m-%d %H:%M:%S')} ===\n")
            f.write(f"=== {len(crmids)} results found ===\n\n")
            for order in orders_data:
                record_id = order.get('crmid')
                if record_id:
                    extracted_text = order.get('html_details', {}).get('cleaned_text', '')
                    if extracted_text:
                        f.write(f"\n--- CRM ID: {record_id} ---\n")
                        f.write(f"{extracted_text}\n")
                        f.write("-" * 50 + "\n")
        # Write a CSV summary
        csv_filename = os.path.join(output_dir, "summary.csv")
        with open(csv_filename, 'w', encoding='utf-8') as f:
            f.write("CRM ID,Extracted text length,JSON data status\n")
            for order in orders_data:
                record_id = order.get('crmid')
                text_length = len(order.get('html_details', {}).get('cleaned_text', ''))
                json_status = "OK" if order.get('json_details') and not order.get('json_details').get('error') else "failed"
                f.write(f"{record_id},{text_length},{json_status}\n")
print(f"\n✅ 全部完成!")
print(f" 成功处理: {success_count}/{len(crmids)} 个CRM ID")
print(f" 文件保存目录: {os.path.abspath(output_dir)}")
print(f" 主要文件:")
print(f" - search_info.json (搜索信息)")
print(f" - all_orders_combined.json (所有数据)")
print(f" - extracted_texts.txt (提取的文本)")
print(f" - summary.csv (数据摘要)")
print(f" - 按CRM ID单独存储的 {success_count} 个JSON文件")
        # Preview the extracted text
        print("\n=== Extracted text preview ===")
        for order in orders_data[:3]:  # show only the first 3
            record_id = order.get('crmid')
            extracted_text = order.get('html_details', {}).get('cleaned_text', '')
            preview = extracted_text[:100] + "..." if len(extracted_text) > 100 else extracted_text
            print(f"CRM ID {record_id}: {preview}")
        if len(orders_data) > 3:
            print(f"... and {len(orders_data) - 3} more not shown")
    except Exception as e:
        print(f"\n❌ Unhandled error: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()