Files
Contract-document-crawling-…/搜索获取数据.py
2026-01-18 11:31:40 +08:00

371 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import json
import time
import os
from lxml import html
import re
import urllib.parse
import pandas as pd # ★ 引入pandas用于处理多Sheet Excel
# ================= 1. Configuration =================
# Base URL of the CRM (vtiger-style index.php dispatcher).
base_url = "http://111.198.24.44:88/index.php"
# Form fields POSTed to the CRM login endpoint.
login_payload = {
    "module": "Users",
    "action": "Authenticate",
    "return_module": "Users",
    "return_action": "Login",
    "user_name": "TEST",  # fill in the real user name
    "user_password": "****",  # fill in the real password
    "login_theme": "newskin"
}
# HTTP headers sent with every request (browser UA + Referer).
http_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Referer": "http://111.198.24.44:88/index.php?module=SalesOrder&action=index"
}
# ================= 2. 核心辅助函数 =================
def get_current_action_id():
    """Return the current time as a 13-digit millisecond timestamp (int)."""
    millis = time.time() * 1000
    return int(millis)
def clean_text_structure(element):
    """Return the element's visible text, cleaned up.

    Drops <script>/<style>/<noscript> subtrees, turns <br> into newlines,
    collapses non-breaking spaces, and removes blank lines.  Works on a
    deep copy so the caller's tree is never mutated.  Returns "" for None.
    """
    if element is None:
        return ""
    import copy
    node = copy.deepcopy(element)
    # Remove non-content subtrees before extracting text.
    for junk in node.xpath('.//script | .//style | .//noscript'):
        junk.drop_tree()
    # Make <br> line breaks survive text_content() by injecting "\n".
    for br in node.xpath('.//br'):
        br.tail = "\n" + (br.tail or "")
    cleaned = (
        raw.replace('\xa0', ' ').strip()
        for raw in node.text_content().splitlines()
    )
    return "\n".join(ln for ln in cleaned if ln)
def extract_html_content(html_content, xpath):
    """Find the first node matching *xpath* in *html_content*.

    Returns a dict with the node's pretty-printed HTML ("raw_html") and its
    cleaned visible text ("cleaned_text"); both are "" when nothing matches
    or parsing fails (the error is printed, not raised).
    """
    try:
        nodes = html.fromstring(html_content).xpath(xpath)
        if not nodes:
            return {"raw_html": "", "cleaned_text": ""}
        node = nodes[0]
        return {
            "raw_html": html.tostring(node, encoding='unicode', pretty_print=True),
            "cleaned_text": clean_text_structure(node),
        }
    except Exception as e:
        print(f" ❌ HTML解析错误: {e}")
        return {"raw_html": "", "cleaned_text": ""}
def fetch_html_detail(session, record_id, xpath):
    """Fetch a SalesOrder detail page and extract the node at *xpath*.

    Best effort: any network error or non-200 response yields a dict with
    empty "raw_html"/"cleaned_text" instead of raising.
    """
    empty_result = {"raw_html": "", "cleaned_text": ""}
    url = (
        "http://111.198.24.44:88/index.php"
        f"?module=SalesOrder&action=DetailView&record={record_id}"
    )
    try:
        resp = session.get(url, headers=http_headers)
        if resp.status_code != 200:
            return empty_result
        return extract_html_content(resp.content, xpath)
    except Exception:
        return empty_result
def extract_crmid_from_search_result(html_content):
    """Collect unique CRM record ids from a unified-search result page.

    Ids are read from the record= argument inside anchor onclick handlers;
    first the SalesOrder collapse section is tried, then any SalesOrder
    link on the page.  Order of first appearance is preserved.  Returns []
    on any parse failure.
    """
    try:
        tree = html.fromstring(html_content)
        anchors = tree.xpath(
            '//div[@id="collapse-SalesOrder"]//a[contains(@onclick, "record=")]'
        )
        if not anchors:
            # Fallback: search the whole page for SalesOrder record links.
            anchors = tree.xpath(
                '//a[contains(@onclick, "module=SalesOrder") and contains(@onclick, "record=")]'
            )
        found = []
        for anchor in anchors:
            m = re.search(r"record=(\d+)", anchor.get('onclick', ''))
            if m and m.group(1) not in found:
                found.append(m.group(1))
        return found
    except Exception:
        return []
def perform_search(session, query_string):
    """Run the CRM unified search and return SalesOrder record ids.

    *query_string* must already be URL-encoded.  Returns [] on network
    failure or a non-200 response.
    """
    search_url = (
        "http://111.198.24.44:88/index.php?module=Home&action=UnifiedSearch"
        f"&selectedmodule=undefined&query_string={query_string}"
    )
    try:
        resp = session.get(search_url, headers=http_headers)
        if resp.status_code == 200:
            return extract_crmid_from_search_result(resp.content)
        return []
    except Exception:
        return []
# ================= 3. 核心解析逻辑 =================
def parse_order_text(text):
    """Parse the cleaned text of an order detail page into a flat dict.

    The page renders as alternating label/value lines; labels are mapped
    to canonical field names via ``key_map``.  Returns ``{}`` for empty
    input.  The result may contain a private ``_temp_second_code`` key:
    the caller decides whether it is the domestic or foreign contract
    number based on the W/N prefix of the contract number.
    """
    if not text:
        return {}
    # Field pool shared by domestic and foreign orders (unused keys stay "").
    data = {
        "合同编号": "", "内贸合同号": "", "外贸合同号": "",
        "签署公司": "", "收款情况": "", "签订日期": "", "销售员": "",
        "最终用户单位": "", "最终用户信息联系人": "", "最终用户信息电话": "", "最终用户信息邮箱": "",
        "最终用户所在地": "",
        "买方单位": "", "买方信息联系人": "", "买方信息电话": "", "买方信息邮箱": "",
        "厂家型号": "", "合同标的": "", "数量": "", "单位": "台/套",
        "折扣率(%)": "", "合同额": "", "合同总额": "",
        "外购付款方式": "", "最晚发货期": "",
        "已收款": "", "未收款": "", "收款日期": ""
    }
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    # Page label -> canonical key in `data`.
    key_map = {
        "收款账户": "签署公司",
        "收款状态": "收款情况",
        "签约日期": "签订日期",
        "负责人": "销售员",
        "客户名称": "最终用户单位",
        "联系人姓名": "最终用户信息联系人",
        "合同总额": "合同总额",
        "最新收款日期": "收款日期",
        "最晚发货期": "最晚发货期",
        "付款比例及期限": "外购付款方式",
        "地址": "最终用户所在地"
    }
    for i, line in enumerate(lines):
        # 1.0 Contract/order number: the NEXT line holds
        #     "<contract_no> [<second_code>]".
        if line == "合同订单编号":
            if i + 1 < len(lines):
                parts = lines[i + 1].strip().split()
                if parts:
                    data["合同编号"] = parts[0]
                if len(parts) >= 2:
                    # Stored temporarily; main() assigns it to 内贸合同号 or
                    # 外贸合同号 depending on the contract number's W/N prefix.
                    data["_temp_second_code"] = parts[1]
        # 1.1 Plain label -> value mapping; first occurrence wins.
        elif line in key_map:
            if i + 1 < len(lines):
                target_key = key_map[line]
                if not data[target_key]:
                    data[target_key] = lines[i + 1]
        # 1.2 Product row; assumed format: subject/model/qty/unit_price/total.
        elif "合同标的" in line and "品名/型号" in line:
            if i + 1 < len(lines):
                parts = lines[i + 1].split('/')
                if len(parts) >= 1:
                    data["合同标的"] = parts[0]
                if len(parts) >= 2:
                    data["厂家型号"] = parts[1]
                if len(parts) >= 3:
                    data["数量"] = parts[2]
                if len(parts) >= 5:
                    data["合同额"] = parts[4]
    # 2. Buyer info via regex over the full text.
    buyer_match = re.search(r"(?:买方|The Buyer)[:]\s*(.*?)(?:\n|$)", text)
    if buyer_match and len(buyer_match.group(1)) > 1:
        data["买方单位"] = buyer_match.group(1).strip()
    buyer_contact = re.search(r"联系人Contact person[:]\s*(.*?)(?:\n|$)", text)
    if buyer_contact:
        data["买方信息联系人"] = buyer_contact.group(1).strip()
    buyer_tel = re.search(r"电话\(Tel\)[:]\s*(.*?)(?:\s+|$|传真)", text)
    if buyer_tel:
        data["买方信息电话"] = buyer_tel.group(1).strip()
    # 3. Derive paid/unpaid amounts from the total and the payment status.
    try:
        total = float(data["合同总额"]) if data["合同总额"] else 0
        status = data["收款情况"]
        if "已收" in status:
            data["已收款"] = str(total)
            data["未收款"] = "0"
        else:
            # BUG FIX: the original branch was `elif "" in status:`, which is
            # always true for any string — the intended keyword (presumably
            # "未收") was evidently lost.  An explicit `else` preserves the
            # original runtime behavior while removing the misleading test.
            data["已收款"] = "0"
            data["未收款"] = str(total)
    except ValueError:
        # Non-numeric total (e.g. "1,000.00"): leave the amounts blank.
        pass
    return data
# ================= 4. 主程序逻辑 =================
def main():
    """Interactive pipeline: log in, search orders, parse details, export Excel.

    Flow: POST login -> prompt for a keyword -> unified search for record
    ids -> fetch/parse each detail page -> classify by contract-number
    prefix (W = foreign, N = domestic, else other) -> write one Excel file
    with up to three sheets.
    """
    session = requests.Session()
    # Absolute XPath of the detail block on the order page.
    # NOTE(review): brittle — breaks if the CRM page layout changes.
    target_xpath = "/html/body/div[1]/div/div[2]/div[2]/form/div[1]/div[1]/div[2]"
    try:
        # --- 1. Login ---
        print("1. 正在登录...")
        session.post(base_url, data=login_payload, headers=http_headers)
        # A PHPSESSID cookie is taken as evidence of a successful login.
        if 'PHPSESSID' in session.cookies:
            print(" ✅ 登录成功")
        else:
            print(" ⚠️ 警告: 未检测到Cookie可能登录失败")
        # --- 2. Search ---
        print("\n2. 请输入搜索内容:")
        query_input = input(" 搜索关键词: ").strip()
        if not query_input: return
        encoded_query = urllib.parse.quote(query_input)
        print(f"\n3. 执行搜索...")
        crmids = perform_search(session, encoded_query)
        if not crmids:
            print(" ❌ 未找到相关订单。")
            return
        print(f" ✅ 找到 {len(crmids)} 个订单 ID: {crmids}")
        # --- 3. Fetch details and classify ---
        print(f"\n4. 开始获取详情并分类处理...")
        # Three buckets, one per output sheet.
        list_domestic = []  # domestic trade (contract no. starts with N)
        list_foreign = []  # foreign trade (starts with W)
        list_other = []  # anything else
        valid_count = 0
        for i, crmid in enumerate(crmids):
            print(f" [{i + 1}/{len(crmids)}] 处理 ID: {crmid}")
            html_data = fetch_html_detail(session, crmid, target_xpath)
            clean_text = html_data['cleaned_text']
            # Parse the cleaned page text into a field dict.
            data = parse_order_text(clean_text)
            contract_no = data.get("合同编号", "").strip().upper()  # uppercase for prefix check
            # Skip records where no contract number could be parsed.
            if not contract_no:
                print(f" ⚠️ 跳过: 未找到合同编号")
                continue
            # Classification: the temporarily stored second code becomes the
            # domestic or foreign contract number based on the W/N prefix.
            second_code = data.pop("_temp_second_code", "")
            if contract_no.startswith('W'):
                # Foreign trade
                data['外贸合同号'] = second_code
                list_foreign.append(data)
                print(f" 🌍 归类: [外贸] {contract_no}")
            elif contract_no.startswith('N'):
                # Domestic trade
                data['内贸合同号'] = second_code
                list_domestic.append(data)
                print(f" 🏠 归类: [内贸] {contract_no}")
            else:
                # Other
                data['内贸合同号'] = second_code  # default slot for unknown prefixes
                list_other.append(data)
                print(f" ❓ 归类: [其他] {contract_no}")
            valid_count += 1
            time.sleep(0.5)  # throttle to be gentle on the server
        # --- 4. Export Excel (multi-sheet) ---
        print(f"\n5. 正在导出 Excel 文件...")
        if valid_count == 0:
            print(" ❌ 无有效数据导出")
            return
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        output_dir = f"Result_{timestamp}"
        os.makedirs(output_dir, exist_ok=True)
        xlsx_filename = os.path.join(output_dir, f"Export_{query_input}_{timestamp}.xlsx")
        # Column order (header row) for the domestic sheet.
        cols_domestic = [
            "合同编号", "签署公司", "内贸合同号", "收款情况", "签订日期", "销售员",
            "最终用户单位", "最终用户信息联系人", "最终用户信息电话", "最终用户信息邮箱", "最终用户所在地",
            "买方单位", "买方信息联系人", "买方信息电话", "买方信息邮箱",
            "厂家型号", "合同标的", "数量", "单位", "折扣率(%)", "合同额", "合同总额",
            "外购付款方式", "最晚发货期", "已收款", "未收款", "收款日期"
        ]
        # Foreign-sheet header: same as domestic except 外贸合同号 replaces 内贸合同号.
        cols_foreign = [
            "合同编号", "签署公司", "外贸合同号", "收款情况", "签订日期", "销售员",
            "最终用户单位", "最终用户信息联系人", "最终用户信息电话", "最终用户信息邮箱", "最终用户所在地",
            "买方单位", "买方信息联系人", "买方信息电话", "买方信息邮箱",
            "厂家型号", "合同标的", "数量", "单位", "折扣率(%)", "合同额", "合同总额",
            "外购付款方式", "最晚发货期", "已收款", "未收款", "收款日期"
        ]
        # Write multiple sheets through pandas' ExcelWriter.
        try:
            with pd.ExcelWriter(xlsx_filename, engine='openpyxl') as writer:
                # 1. Domestic sheet
                if list_domestic:
                    df_domestic = pd.DataFrame(list_domestic)
                    # reindex enforces column order; missing columns become empty
                    df_domestic = df_domestic.reindex(columns=cols_domestic)
                    df_domestic.to_excel(writer, sheet_name='内贸', index=False)
                # 2. Foreign sheet
                if list_foreign:
                    df_foreign = pd.DataFrame(list_foreign)
                    df_foreign = df_foreign.reindex(columns=cols_foreign)
                    df_foreign.to_excel(writer, sheet_name='外贸', index=False)
                # 3. "Other" sheet (reuses the domestic header layout)
                if list_other:
                    df_other = pd.DataFrame(list_other)
                    df_other = df_other.reindex(columns=cols_domestic)
                    df_other.to_excel(writer, sheet_name='其他', index=False)
            print(f" ✅ 成功导出多Sheet表格: {os.path.abspath(xlsx_filename)}")
            print(f" - 内贸: {len(list_domestic)}")
            print(f" - 外贸: {len(list_foreign)}")
            print(f" - 其他: {len(list_other)}")
        except ImportError:
            print(" ❌ 错误: 缺少 pandas 或 openpyxl 库。")
            print(" 请在终端运行: pip install pandas openpyxl")
        except Exception as e:
            print(f" ❌ 写入 Excel 失败: {e}")
    except Exception as e:
        # Top-level boundary: report and dump the traceback instead of crashing.
        print(f"\n❌ 程序发生错误: {e}")
        import traceback
        traceback.print_exc()
# Run interactively only when executed as a script, not on import.
if __name__ == "__main__":
    main()