搜索测试成功
This commit is contained in:
326
拿取内容.py
Normal file
326
拿取内容.py
Normal file
@ -0,0 +1,326 @@
|
|||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
from lxml import html
|
||||||
|
import re
|
||||||
|
|
||||||
|
# ================= 1. Configuration =================

# Entry point of the CRM web app (vtiger-style index.php dispatcher).
base_url = "http://111.198.24.44:88/index.php"

# Login parameters
login_payload = {
    "module": "Users",
    "action": "Authenticate",
    "return_module": "Users",
    "return_action": "Login",
    "user_name": "TEST",  # put the real username here
    "user_password": "test",  # put the real password here
    "login_theme": "newskin"
}

# List-query parameters (POSTed to the SalesOrder list AJAX endpoint)
list_payload = {
    "module": "SalesOrder",
    "action": "SalesOrderAjax",
    "file": "ListViewData",
    "sorder": "",
    "start": "1",
    "pagesize": "100",  # how many rows to fetch
    "actionId": "",  # filled in automatically later (millisecond timestamp)
    "isFilter": "true",
    "search[viewscope]": "all_to_me",
    "search[viewname]": "324126",
    # Filter conditions (3 clauses, all must match — see filter[matchtype])
    "filter[Fields0]": "subject",
    "filter[Condition0]": "cts",
    "filter[Srch_value0]": "W25A",
    "filter[type0]": "text",
    "filter[dateCondition1]": "prevfy",
    "filter[Fields1]": "duedate",
    "filter[Condition1]": "btwa",
    "filter[Srch_value1]": "2025-01-01,2025-12-31",
    "filter[type1]": "date",
    "filter[Fields2]": "subject",
    "filter[Condition2]": "dcts",
    "filter[Srch_value2]": "取消",
    "filter[type2]": "text",
    "filter[search_cnt]": "3",
    "filter[matchtype]": "all"
}

# Shared request headers; the Referer mimics a browser on the list page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Referer": "http://111.198.24.44:88/index.php?module=SalesOrder&action=index"
}
|
||||||
|
|
||||||
|
|
||||||
|
# ================= 2. Helper functions =================
def get_current_action_id():
    """Return the current time as a 13-digit millisecond timestamp."""
    now_seconds = time.time()
    return int(now_seconds * 1000)
|
||||||
|
|
||||||
|
|
||||||
|
def clean_html_tags(text):
    """Strip HTML tags from *text* and normalize whitespace.

    Tags are replaced with spaces (so adjacent text nodes stay
    separated), ``&nbsp;`` entities become ordinary spaces, runs of
    whitespace collapse to a single space, and the ends are trimmed.
    Falsy input yields an empty string.
    """
    if not text:
        return ""
    # Tags -> spaces, then non-breaking spaces -> plain spaces.
    stripped = re.sub(r'<[^>]+>', ' ', text).replace('&nbsp;', ' ')
    # Collapse whitespace runs (including newlines) and trim.
    return re.sub(r'\s+', ' ', stripped).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_html_content(html_content, xpath):
    """Extract the first node matching *xpath* from an HTML document.

    Returns a dict with the node's serialized HTML (``raw_html``) and
    its tag-stripped text (``cleaned_text``). Both values are empty
    strings when the XPath matches nothing or parsing fails; failures
    are reported to stdout rather than raised.
    """
    empty_result = {"raw_html": "", "cleaned_text": ""}
    try:
        document = html.fromstring(html_content)
        matches = document.xpath(xpath)
        if not matches:
            print(f"  ⚠️ 未找到XPath: {xpath}")
            return dict(empty_result)
        # Serialize only the first match, then derive the plain text.
        raw = html.tostring(matches[0], encoding='unicode', pretty_print=True)
        return {
            "raw_html": raw,
            "cleaned_text": clean_html_tags(raw)
        }
    except Exception as e:
        print(f"  ❌ HTML解析错误: {e}")
        return dict(empty_result)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_html_detail(session, record_id, xpath):
    """Fetch the SalesOrder detail page for *record_id* and extract *xpath*.

    Uses the authenticated *session* and the module-level ``headers``.
    On a non-200 response or any network error an empty result dict is
    returned and the problem is reported to stdout.
    """
    empty_result = {"raw_html": "", "cleaned_text": ""}
    try:
        detail_url = (
            "http://111.198.24.44:88/index.php"
            f"?module=SalesOrder&action=DetailView&record={record_id}"
        )
        response = session.get(detail_url, headers=headers)
        if response.status_code != 200:
            print(f"  ❌ HTML页面请求失败: HTTP {response.status_code}")
            return dict(empty_result)
        # Delegate node extraction + text cleaning to the shared helper.
        return extract_html_content(response.content, xpath)
    except Exception as e:
        print(f"  ❌ 获取HTML详情失败: {e}")
        return dict(empty_result)
|
||||||
|
|
||||||
|
|
||||||
|
# ================= 3. Main program =================
def main():
    """Log in, fetch the filtered SalesOrder list, enrich each order with
    JSON + HTML details, and persist everything under ``crm_data/``."""
    session = requests.Session()

    # XPath of the detail-page region to extract for every order.
    target_xpath = "/html/body/div[1]/div/div[2]/div[2]/form/div[1]/div[1]/div[2]"

    try:
        # --- Step 1: log in ---
        print("1. 正在登录...")
        login_response = session.post(base_url, data=login_payload, headers=headers)

        # The app issues a PHPSESSID cookie on success; warn (don't abort) otherwise.
        if 'PHPSESSID' not in session.cookies:
            print("⚠️ 警告:未检测到 PHPSESSID,登录可能失败,后续操作可能会出错。")
        else:
            print("   ✅ 登录成功,Cookie 已获取。")

        # --- Step 2: fetch the order list ---
        print("\n2. 正在获取订单列表...")
        list_payload['actionId'] = get_current_action_id()

        list_resp = session.post(base_url, data=list_payload, headers=headers)

        try:
            list_data = list_resp.json()
        except json.JSONDecodeError:
            print("❌ 错误:列表接口返回的不是 JSON 数据。")
            print("返回内容预览:", list_resp.text[:200])
            return

        # === Tolerant parsing of the list payload (shape varies by server) ===
        orders = []

        # Strategy A: response is already a list of orders.
        if isinstance(list_data, list):
            orders = list_data

        # Strategy B: response is a dict wrapping the list somewhere.
        elif isinstance(list_data, dict):
            # 1. Try common wrapper key names first.
            found_key = False
            possible_keys = ['entries', 'rows', 'data', 'records', 'list']
            for key in possible_keys:
                if key in list_data and isinstance(list_data[key], list):
                    orders = list_data[key]
                    print(f"   [系统] 自动在键名 '{key}' 下找到数据列表。")
                    found_key = True
                    break

            # 2. Fallback: collect any value that looks like an order record.
            if not found_key:
                print("   [系统] 未找到标准键名,正在尝试智能提取字典对象...")
                for val in list_data.values():
                    if isinstance(val, dict) and ('crmid' in val or 'salesorderid' in val or 'id' in val):
                        orders.append(val)

        if not orders:
            print("❌ 错误:未能提取到任何订单数据。")
            # Dump the raw response so the shape can be inspected offline.
            with open("debug_list_response.json", "w", encoding="utf-8") as f:
                json.dump(list_data, f, ensure_ascii=False, indent=4)
            return

        print(f"   ✅ 成功提取到 {len(orders)} 条有效订单。")

        # --- Step 3: fetch details for each order ---
        print("\n3. 开始逐个获取订单详情...")
        success_count = 0

        for index, order in enumerate(orders):
            # Defensive: skip anything that is not a record dict.
            if not isinstance(order, dict):
                continue

            # 1. Resolve the record ID (field name varies across responses).
            record_id = order.get('crmid') or order.get('salesorderid') or order.get('id')

            if not record_id:
                print(f"   ⚠️ 第 {index + 1} 条数据没有找到 ID,跳过。")
                continue

            print(f"\n   [{index + 1}/{len(orders)}] 处理订单 ID: {record_id}")

            # 2. JSON detail (product table) via the plugin endpoint.
            json_detail = None
            detail_payload = {
                "module": "Plugins",
                "pluginName": "DetailProductTable",
                "action": "getTableData",
                "moduleName": "SalesOrder",
                "record": record_id,
                "actionId": get_current_action_id(),
                "isTool": "1"
            }

            try:
                detail_resp = session.post(base_url, data=detail_payload, headers=headers)
                json_detail = detail_resp.json()
                print(f"   ✅ JSON详情获取成功")
            except Exception as e:
                # Record the failure inline so the output file shows which orders failed.
                print(f"   ❌ JSON详情获取失败: {e}")
                json_detail = {"error": str(e)}

            # 3. HTML detail page, reduced to the target XPath region.
            print(f"   正在获取HTML详情...")
            html_content = fetch_html_detail(session, record_id, target_xpath)

            # 4. Merge the details back into the order record (mutates `orders`).
            order['json_details'] = json_detail
            order['html_details'] = html_content

            # 5. Convenience view combining both sources.
            order['combined_data'] = {
                "crmid": record_id,
                "json_data": json_detail,
                "html_extracted_text": html_content.get("cleaned_text", ""),
                "html_raw": html_content.get("raw_html", "")
            }

            success_count += 1
            print(f"   ✅ 订单 {record_id} 处理完成")

            # Politeness delay to avoid hammering the server.
            time.sleep(0.5)

        # --- Step 4: save results ---
        print(f"\n4. 正在保存结果...")

        # Output directory (created if missing).
        output_dir = "crm_data"
        os.makedirs(output_dir, exist_ok=True)

        # Full combined dataset.
        full_filename = os.path.join(output_dir, "all_orders_combined.json")
        with open(full_filename, 'w', encoding='utf-8') as f:
            json.dump(orders, f, ensure_ascii=False, indent=4)

        # One JSON file per CRM ID.
        print(f"   正在按CRM ID分别存储文件...")
        for order in orders:
            record_id = order.get('crmid') or order.get('salesorderid') or order.get('id')
            if record_id:
                single_filename = os.path.join(output_dir, f"crm_{record_id}.json")
                with open(single_filename, 'w', encoding='utf-8') as f:
                    json.dump(order, f, ensure_ascii=False, indent=4)

        # Plain-text dump of the extracted HTML text, for quick review.
        text_filename = os.path.join(output_dir, "extracted_texts.txt")
        with open(text_filename, 'w', encoding='utf-8') as f:
            f.write("=== 提取的HTML文本内容 ===\n\n")
            for order in orders:
                record_id = order.get('crmid') or order.get('salesorderid') or order.get('id')
                if record_id:
                    extracted_text = order.get('html_details', {}).get('cleaned_text', '')
                    if extracted_text:
                        f.write(f"\n--- CRM ID: {record_id} ---\n")
                        f.write(f"{extracted_text}\n")
                        f.write("-" * 50 + "\n")

        print(f"\n✅ 全部完成!")
        print(f"   成功处理: {success_count}/{len(orders)} 个订单")
        print(f"   文件保存目录: {os.path.abspath(output_dir)}")
        print(f"   主要文件:")
        print(f"   - {full_filename}")
        print(f"   - {text_filename}")
        print(f"   - 按CRM ID单独存储的 {success_count} 个JSON文件")

    except Exception as e:
        # Top-level guard: report anything unexpected with a traceback.
        print(f"\n❌ 程序发生未捕获的错误: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
|
||||||
420
搜索获取数据.py
Normal file
420
搜索获取数据.py
Normal file
@ -0,0 +1,420 @@
|
|||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
from lxml import html
|
||||||
|
import re
|
||||||
|
|
||||||
|
# ================= 1. Configuration =================

# Entry point of the CRM web app (vtiger-style index.php dispatcher).
base_url = "http://111.198.24.44:88/index.php"

# Login parameters
login_payload = {
    "module": "Users",
    "action": "Authenticate",
    "return_module": "Users",
    "return_action": "Login",
    "user_name": "TEST",  # put the real username here
    "user_password": "test",  # put the real password here
    "login_theme": "newskin"
}

# Shared request headers; the Referer mimics a browser on the list page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Referer": "http://111.198.24.44:88/index.php?module=SalesOrder&action=index"
}
|
||||||
|
|
||||||
|
|
||||||
|
# ================= 2. Helper functions =================
def get_current_action_id():
    """Return the current time as a 13-digit millisecond timestamp."""
    millis = time.time() * 1000
    return int(millis)
|
||||||
|
|
||||||
|
|
||||||
|
def clean_html_tags(text):
    """Strip HTML tags from *text*, keeping only normalized text.

    Replaces tags with spaces, converts ``&nbsp;`` entities to plain
    spaces, collapses whitespace runs, and trims the result. Falsy
    input returns an empty string.
    """
    if not text:
        return ""
    # Remove markup, keeping word boundaries intact.
    plain = re.sub(r'<[^>]+>', ' ', text)
    plain = plain.replace('&nbsp;', ' ')
    # Normalize all remaining whitespace to single spaces.
    plain = re.sub(r'\s+', ' ', plain)
    return plain.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_html_content(html_content, xpath):
    """Return the first *xpath* match in *html_content* as raw + cleaned text.

    The result dict always has ``raw_html`` and ``cleaned_text`` keys;
    both are empty strings when nothing matches or parsing fails.
    Errors are printed, never raised.
    """
    nothing = {"raw_html": "", "cleaned_text": ""}
    try:
        document = html.fromstring(html_content)
        nodes = document.xpath(xpath)
        if not nodes:
            print(f"  ⚠️ 未找到XPath: {xpath}")
            return dict(nothing)
        # Only the first match is kept, serialized then tag-stripped.
        serialized = html.tostring(nodes[0], encoding='unicode', pretty_print=True)
        return {
            "raw_html": serialized,
            "cleaned_text": clean_html_tags(serialized)
        }
    except Exception as e:
        print(f"  ❌ HTML解析错误: {e}")
        return dict(nothing)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_html_detail(session, record_id, xpath):
    """Download the SalesOrder detail page for *record_id*, extract *xpath*.

    Uses the authenticated *session* and module-level ``headers``. Any
    HTTP failure or exception yields an empty result dict; errors go
    to stdout.
    """
    nothing = {"raw_html": "", "cleaned_text": ""}
    try:
        page_url = (
            "http://111.198.24.44:88/index.php"
            f"?module=SalesOrder&action=DetailView&record={record_id}"
        )
        page = session.get(page_url, headers=headers)
        if page.status_code != 200:
            print(f"  ❌ HTML页面请求失败: HTTP {page.status_code}")
            return dict(nothing)
        return extract_html_content(page.content, xpath)
    except Exception as e:
        print(f"  ❌ 获取HTML详情失败: {e}")
        return dict(nothing)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_crmid_from_search_result(html_content):
    """Parse a UnifiedSearch result page and return SalesOrder CRM IDs.

    Two strategies are tried in order:
      1. Probe the known absolute XPath of result rows, incrementing the
         row index until it has been missing past index 20.
      2. Fallback: scan every ``<a>`` whose ``onclick`` mentions
         ``module=SalesOrder``.
    IDs are pulled from the ``record=<digits>`` fragment of each link's
    ``onclick``. Returns the IDs deduplicated, in the order first seen;
    on a parse failure the partial list gathered so far is returned and
    the traceback is printed.
    """
    crmids = []

    try:
        tree = html.fromstring(html_content)

        # The SalesOrder results live in a dedicated collapsible div.
        sales_order_div = tree.xpath('//div[@class="collapse in" and @id="collapse-SalesOrder"]')

        if not sales_order_div:
            print("  ⚠️ 未找到SalesOrder模块的搜索结果")
            # Keep the page around so the missing structure can be inspected.
            with open("debug_no_salesorder.html", "w", encoding="utf-8") as f:
                f.write(html.tostring(tree, encoding='unicode', pretty_print=True))
            return crmids

        print("  ✅ 找到SalesOrder模块")

        # Strategy 1: rows follow a fixed absolute path where only the
        # second-to-last div index varies (starting at 2).
        base_path = "/html/body/div[1]/div/div[2]/div/div/div[5]/div/div/div/div/div[3]"

        n = 2
        while True:
            xpath_pattern = f"{base_path}/div[{n}]/div[2]/a"
            elements = tree.xpath(xpath_pattern)

            if not elements:
                # Keep probing through gaps; stop once we're past index 20.
                if n > 20:
                    break
                n += 1
                continue

            # Row found — harvest the record IDs from the links' onclick.
            for element in elements:
                onclick_attr = element.get('onclick', '')
                if onclick_attr:
                    match = re.search(r"record=(\d+)", onclick_attr)
                    if match:
                        crmid = match.group(1)
                        if crmid not in crmids:
                            crmids.append(crmid)
                            print(f"    从XPath {xpath_pattern} 找到CRM ID: {crmid}")

            n += 1

        # Strategy 2 (fallback): any SalesOrder link anywhere on the page.
        if not crmids:
            print("  尝试备用方法查找CRM ID...")
            salesorder_links = tree.xpath('//a[contains(@onclick, "module=SalesOrder")]')
            for link in salesorder_links:
                onclick_attr = link.get('onclick', '')
                if onclick_attr:
                    match = re.search(r"record=(\d+)", onclick_attr)
                    if match:
                        crmid = match.group(1)
                        if crmid not in crmids:
                            crmids.append(crmid)

            print(f"  备用方法找到 {len(crmids)} 个CRM ID")

        # FIX: the original used list(set(crmids)), which returned the IDs
        # in arbitrary hash order even though `crmids` is already unique.
        # dict.fromkeys dedupes while preserving first-seen order, so the
        # downstream processing/output order is deterministic.
        unique_crmids = list(dict.fromkeys(crmids))
        print(f"  去重后找到 {len(unique_crmids)} 个唯一的CRM ID")

        return unique_crmids

    except Exception as e:
        print(f"  ❌ 解析搜索结果失败: {e}")
        import traceback
        traceback.print_exc()
        return crmids
|
||||||
|
|
||||||
|
|
||||||
|
def perform_search(session, query_string):
    """Run a UnifiedSearch query and return the CRM IDs found in the result.

    *query_string* must already be URL-encoded. The raw result page is
    saved to ``debug_search_result.html`` for offline inspection. Any
    failure returns an empty list and prints the error.
    """
    try:
        search_url = (
            "http://111.198.24.44:88/index.php"
            "?module=Home&action=UnifiedSearch&selectedmodule=undefined"
            f"&query_string={query_string}"
        )

        print(f"  正在搜索: {query_string}")
        print(f"  搜索URL: {search_url}")

        search_response = session.get(search_url, headers=headers)

        if search_response.status_code != 200:
            print(f"  ❌ 搜索请求失败: HTTP {search_response.status_code}")
            return []

        # Persist the page so parsing problems can be debugged later.
        with open("debug_search_result.html", "w", encoding="utf-8") as f:
            f.write(search_response.text)
        print("  搜索结果已保存到 debug_search_result.html")

        return extract_crmid_from_search_result(search_response.content)

    except Exception as e:
        print(f"  ❌ 搜索失败: {e}")
        return []
|
||||||
|
|
||||||
|
|
||||||
|
# ================= 3. Main program =================
def main():
    """Log in, run an interactive UnifiedSearch, fetch JSON + HTML details
    for every CRM ID found, and persist results to a timestamped folder."""
    session = requests.Session()

    # XPath of the detail-page region to extract for every order.
    target_xpath = "/html/body/div[1]/div/div[2]/div[2]/form/div[1]/div[1]/div[2]"

    try:
        # --- Step 1: log in ---
        print("1. 正在登录...")
        login_response = session.post(base_url, data=login_payload, headers=headers)

        # The app issues a PHPSESSID cookie on success; warn (don't abort) otherwise.
        if 'PHPSESSID' not in session.cookies:
            print("⚠️ 警告:未检测到 PHPSESSID,登录可能失败,后续操作可能会出错。")
        else:
            print("   ✅ 登录成功,Cookie 已获取。")

        # --- Step 2: read the search term from the user ---
        print("\n2. 请输入搜索内容:")
        query_string = input("   搜索关键词: ").strip()

        if not query_string:
            print("   ❌ 未输入搜索内容,程序退出。")
            return

        # URL-encode the query for use in the search URL.
        import urllib.parse
        encoded_query = urllib.parse.quote(query_string)

        # --- Step 3: search and collect CRM IDs ---
        print(f"\n3. 正在执行搜索并提取CRM ID...")
        crmids = perform_search(session, encoded_query)

        if not crmids:
            print("   ❌ 未找到任何CRM ID,程序退出。")
            return

        print(f"   ✅ 成功提取到 {len(crmids)} 个CRM ID: {crmids}")

        # --- Step 4: fetch details for every CRM ID ---
        print(f"\n4. 开始逐个获取订单详情...")
        success_count = 0
        orders_data = []

        for index, crmid in enumerate(crmids):
            print(f"\n   [{index + 1}/{len(crmids)}] 处理CRM ID: {crmid}")

            # 1. JSON detail (product table) via the plugin endpoint.
            json_detail = None
            detail_payload = {
                "module": "Plugins",
                "pluginName": "DetailProductTable",
                "action": "getTableData",
                "moduleName": "SalesOrder",
                "record": crmid,
                "actionId": get_current_action_id(),
                "isTool": "1"
            }

            try:
                detail_resp = session.post(base_url, data=detail_payload, headers=headers)
                json_detail = detail_resp.json()
                print(f"   ✅ JSON详情获取成功")
            except Exception as e:
                # Record the failure inline so output files show which IDs failed.
                print(f"   ❌ JSON详情获取失败: {e}")
                json_detail = {"error": str(e)}

            # 2. HTML detail page, reduced to the target XPath region.
            print(f"   正在获取HTML详情...")
            html_content = fetch_html_detail(session, crmid, target_xpath)

            # 3. Assemble the per-order record (raw parts + combined view).
            order_data = {
                "crmid": crmid,
                "json_details": json_detail,
                "html_details": html_content,
                "combined_data": {
                    "crmid": crmid,
                    "json_data": json_detail,
                    "html_extracted_text": html_content.get("cleaned_text", ""),
                    "html_raw": html_content.get("raw_html", "")
                }
            }

            orders_data.append(order_data)
            success_count += 1
            print(f"   ✅ CRM ID {crmid} 处理完成")

            # Politeness delay to avoid hammering the server.
            time.sleep(0.5)

        # --- Step 5: save results ---
        print(f"\n5. 正在保存结果...")

        # Timestamped output directory so repeated searches don't collide.
        import datetime
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = f"crm_data_search_{timestamp}"
        os.makedirs(output_dir, exist_ok=True)

        # Metadata about this search run.
        search_info = {
            "query_string": query_string,
            "encoded_query": encoded_query,
            "search_time": time.strftime("%Y-%m-%d %H:%M:%S"),
            "crmids_found": crmids,
            "total_count": len(crmids)
        }

        with open(os.path.join(output_dir, "search_info.json"), 'w', encoding='utf-8') as f:
            json.dump(search_info, f, ensure_ascii=False, indent=4)

        # Full combined dataset.
        full_filename = os.path.join(output_dir, "all_orders_combined.json")
        with open(full_filename, 'w', encoding='utf-8') as f:
            json.dump(orders_data, f, ensure_ascii=False, indent=4)

        # One JSON file per CRM ID.
        print(f"   正在按CRM ID分别存储文件...")
        for order in orders_data:
            record_id = order.get('crmid')
            if record_id:
                single_filename = os.path.join(output_dir, f"crm_{record_id}.json")
                with open(single_filename, 'w', encoding='utf-8') as f:
                    json.dump(order, f, ensure_ascii=False, indent=4)

        # Plain-text dump of the extracted HTML text, for quick review.
        text_filename = os.path.join(output_dir, "extracted_texts.txt")
        with open(text_filename, 'w', encoding='utf-8') as f:
            f.write(f"=== 搜索查询: {query_string} ===\n")
            f.write(f"=== 提取时间: {time.strftime('%Y-%m-%d %H:%M:%S')} ===\n")
            f.write(f"=== 共找到 {len(crmids)} 个结果 ===\n\n")

            for order in orders_data:
                record_id = order.get('crmid')
                if record_id:
                    extracted_text = order.get('html_details', {}).get('cleaned_text', '')
                    if extracted_text:
                        f.write(f"\n--- CRM ID: {record_id} ---\n")
                        f.write(f"{extracted_text}\n")
                        f.write("-" * 50 + "\n")

        # CSV summary: one row per order with text length and JSON status.
        csv_filename = os.path.join(output_dir, "summary.csv")
        with open(csv_filename, 'w', encoding='utf-8') as f:
            f.write("CRM ID,提取文本长度,JSON数据状态\n")
            for order in orders_data:
                record_id = order.get('crmid')
                text_length = len(order.get('html_details', {}).get('cleaned_text', ''))
                json_status = "成功" if order.get('json_details') and not order.get('json_details').get(
                    'error') else "失败"
                f.write(f"{record_id},{text_length},{json_status}\n")

        print(f"\n✅ 全部完成!")
        print(f"   成功处理: {success_count}/{len(crmids)} 个CRM ID")
        print(f"   文件保存目录: {os.path.abspath(output_dir)}")
        print(f"   主要文件:")
        print(f"   - search_info.json (搜索信息)")
        print(f"   - all_orders_combined.json (所有数据)")
        print(f"   - extracted_texts.txt (提取的文本)")
        print(f"   - summary.csv (数据摘要)")
        print(f"   - 按CRM ID单独存储的 {success_count} 个JSON文件")

        # Preview of the first few extracted texts on stdout.
        print(f"\n=== 提取文本预览 ===")
        for i, order in enumerate(orders_data[:3]):  # show at most 3
            record_id = order.get('crmid')
            extracted_text = order.get('html_details', {}).get('cleaned_text', '')
            preview = extracted_text[:100] + "..." if len(extracted_text) > 100 else extracted_text
            print(f"CRM ID {record_id}: {preview}")

        if len(orders_data) > 3:
            print(f"... 还有 {len(orders_data) - 3} 个未显示")

    except Exception as e:
        # Top-level guard: report anything unexpected with a traceback.
        print(f"\n❌ 程序发生未捕获的错误: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
|
||||||
@ -12,7 +12,7 @@ login_payload = {
|
|||||||
"return_module": "Users",
|
"return_module": "Users",
|
||||||
"return_action": "Login",
|
"return_action": "Login",
|
||||||
"user_name": "TEST", # 在这里填入真实的用户名
|
"user_name": "TEST", # 在这里填入真实的用户名
|
||||||
"user_password": "test", # 在这里填入真实的密码
|
"user_password": "***", # 在这里填入真实的密码
|
||||||
"login_theme": "newskin"
|
"login_theme": "newskin"
|
||||||
}
|
}
|
||||||
|
|
||||||
Reference in New Issue
Block a user