Files
Contract-document-crawling-…/商品明细.py
2026-01-19 10:46:05 +08:00

221 lines
7.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import json
import time
import os
import pandas as pd
import re
# ================= 1. 配置区域 =================
base_url = "http://111.198.24.44:88/index.php"
# 登录信息
login_payload = {
"module": "Users",
"action": "Authenticate",
"return_module": "Users",
"return_action": "Login",
"user_name": "TEST", # <--- 【请修改】这里填用户名
"user_password": "****", # <--- 【请修改】这里填密码
"login_theme": "newskin"
}
# 列表查询参数
list_payload = {
"module": "SalesOrder",
"action": "SalesOrderAjax",
"file": "ListViewData",
"sorder": "",
"start": "1",
"pagesize": "50",
"actionId": "",
"isFilter": "true",
"search[viewscope]": "all_to_me",
"search[viewname]": "324126",
# 筛选条件
"filter[Fields0]": "subject",
"filter[Condition0]": "cts",
"filter[Srch_value0]": "W25A",
"filter[type0]": "text",
"filter[dateCondition1]": "prevfy",
"filter[Fields1]": "duedate",
"filter[Condition1]": "btwa",
"filter[Srch_value1]": "2025-01-01,2025-12-31",
"filter[type1]": "date",
"filter[Fields2]": "subject",
"filter[Condition2]": "dcts",
"filter[Srch_value2]": "取消",
"filter[type2]": "text",
"filter[search_cnt]": "3",
"filter[matchtype]": "all"
}
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Referer": "http://111.198.24.44:88/index.php?module=SalesOrder&action=index"
}
# ================= 2. 辅助工具 =================
def get_timestamp():
return int(time.time() * 1000)
def extract_nested_value(item, key):
"""提取 {'value': '...'} 结构的值"""
if not item or key not in item:
return ""
val = item[key]
if isinstance(val, dict) and 'value' in val:
return val['value']
return val
def clean_html(text):
"""清洗HTML标签只留纯文本"""
if not isinstance(text, str): return str(text)
text = re.sub(r'<[^>]+>', '', text)
return text.strip()
# ================= 3. 主程序 =================
def main():
session = requests.Session()
all_rows = []
try:
# --- 1. 登录 ---
print("1. 正在登录...")
session.post(base_url, data=login_payload, headers=headers)
# --- 2. 获取列表 ---
print("2. 获取订单列表...")
list_payload['actionId'] = get_timestamp()
res = session.post(base_url, data=list_payload, headers=headers)
raw_data = res.json()
orders = []
# 列表解析
if isinstance(raw_data, list):
orders = raw_data
elif isinstance(raw_data, dict):
for k in ['entries', 'rows', 'data', 'records']:
if k in raw_data and isinstance(raw_data[k], list):
orders = raw_data[k]
break
if not orders:
for v in raw_data.values():
if isinstance(v, list) and len(v) > 0:
orders = v
break
print(f"✅ 找到 {len(orders)} 个订单,开始处理...")
# --- 3. 逐个提取 ---
for i, order in enumerate(orders):
rid = order.get('crmid') or order.get('salesorderid') or order.get('id')
if not rid: continue
# 列表页基本信息
contract_no = clean_html(order.get('subject', ''))
salesperson = order.get('assigned_user_id', '') or order.get('smownerid', '')
print(f" [{i + 1}/{len(orders)}] 提取: {contract_no}")
# 请求产品详情
detail_payload = {
"module": "Plugins",
"pluginName": "DetailProductTable",
"action": "getTableData",
"moduleName": "SalesOrder",
"record": rid,
"actionId": get_timestamp(),
"isTool": "1"
}
try:
detail_res = session.post(base_url, data=detail_payload, headers=headers)
detail_json = detail_res.json()
# 寻找产品列表 data
products = []
raw_data_content = detail_json.get('data')
if isinstance(raw_data_content, list):
products = raw_data_content
elif isinstance(raw_data_content, dict):
if 'rows' in raw_data_content:
products = raw_data_content['rows']
else:
for v in raw_data_content.values():
if isinstance(v, dict) and ('productid' in v or 'productname' in v):
products.append(v)
if not products:
continue
# --- 核心:严格按你要求的表头填充 ---
for prod in products:
row_data = {
# === 第一部分:确定的字段 ===
"合同编号": contract_no,
"销售员": salesperson,
"厂家": prod.get('cf_2128', ''), # ASD
"货号": prod.get('productcode', ''), # 135636
"产品描述": prod.get('productname', ''), # Full Range...
"数量": extract_nested_value(prod, 'qty'),
"单位": prod.get('usageunit', ''), # 通常字段,如果没有也没关系
"币种": prod.get('cf_534', ''), # USD
"报价单价": extract_nested_value(prod, 'listPrice'), # 4022.20
"报价总价": extract_nested_value(prod, 'subtotal'), # 4022.20
# === 第二部分:按照指示全部留空的字段 ===
"销售单价": "",
"销售总价": "",
"折扣率": "",
"外购": "",
"合同币种/美元": "",
"外购转美元": "",
"报价总价美元": "",
"净合同额美元": ""
}
all_rows.append(row_data)
except Exception as e:
print(f" ❌ 解析错误: {e}")
time.sleep(0.1)
# --- 4. 生成 Excel ---
if all_rows:
# 严格按照你的表头顺序定义
strict_columns = [
'合同编号', '销售员', '厂家', '货号', '产品描述',
'数量', '单位', '币种', '报价单价', '报价总价',
'销售单价', '销售总价', '折扣率', '外购',
'合同币种/美元', '外购转美元', '报价总价美元', '净合同额美元'
]
df = pd.DataFrame(all_rows)
# 确保列存在
for col in strict_columns:
if col not in df.columns:
df[col] = ""
# 强制列顺序
df = df[strict_columns]
filename = "Strict_Format_Export.xlsx"
df.to_excel(filename, index=False)
print(f"\n✅ 表格生成成功!已严格留空指定列,保存至: {os.path.abspath(filename)}")
else:
print("\n❌ 未提取到数据。")
except Exception as e:
print(f"❌ 程序错误: {e}")
if __name__ == "__main__":
main()