import requests
import json
import time
import os
import re
import datetime
import traceback
import urllib.parse

from lxml import html

# ================= 1. Configuration =================
base_url = "http://111.198.24.44:88/index.php"

# Login parameters
login_payload = {
    "module": "Users",
    "action": "Authenticate",
    "return_module": "Users",
    "return_action": "Login",
    "user_name": "TEST",  # fill in the real username here
    "user_password": "test",  # fill in the real password here
    "login_theme": "newskin"
}

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Referer": "http://111.198.24.44:88/index.php?module=SalesOrder&action=index"
}


# ================= 2. Helper functions =================
def get_current_action_id():
    """Generate a 13-digit millisecond timestamp for the current time."""
    return int(time.time() * 1000)


def clean_html_tags(text):
    """Strip HTML tags, keeping only the text content."""
    if not text:
        return ""
    # Remove HTML tags
    clean_text = re.sub(r'<[^>]+>', ' ', text)
    # Replace HTML entities
    clean_text = clean_text.replace('&nbsp;', ' ')
    # Collapse runs of spaces and newlines
    clean_text = re.sub(r'\s+', ' ', clean_text)
    # Trim leading/trailing whitespace
    return clean_text.strip()


def extract_html_content(html_content, xpath):
    """Extract the content at the given XPath from an HTML document."""
    try:
        # Parse the HTML
        tree = html.fromstring(html_content)
        # Try to locate the element at the given XPath
        elements = tree.xpath(xpath)
        if elements:
            # Serialize the element back to HTML
            element_html = html.tostring(elements[0], encoding='unicode', pretty_print=True)
            # Strip the tags for a plain-text version
            cleaned_text = clean_html_tags(element_html)
            # Keep both the raw HTML and the cleaned text
            return {"raw_html": element_html, "cleaned_text": cleaned_text}
        print(f"  ⚠️ XPath not found: {xpath}")
        return {"raw_html": "", "cleaned_text": ""}
    except Exception as e:
        print(f"  ❌ HTML parsing error: {e}")
        return {"raw_html": "", "cleaned_text": ""}


def fetch_html_detail(session, record_id, xpath):
    """Fetch an order's detail page and extract the content at the given XPath."""
    try:
        # Build the detail-page URL
        html_url = f"{base_url}?module=SalesOrder&action=DetailView&record={record_id}"
        # Fetch the page
        html_response = session.get(html_url, headers=headers)
        if html_response.status_code == 200:
            # Extract the content at the requested XPath
            return extract_html_content(html_response.content, xpath)
        print(f"  ❌ Detail-page request failed: HTTP {html_response.status_code}")
        return {"raw_html": "", "cleaned_text": ""}
    except Exception as e:
        print(f"  ❌ Failed to fetch HTML detail: {e}")
        return {"raw_html": "", "cleaned_text": ""}
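
# --- Illustrative sketch (not called by the main flow): how extract_html_content
# and clean_html_tags behave together. The inline markup below is hypothetical
# and only loosely mimics a CRM detail block; it is not taken from the real site.
def _demo_extract_and_clean():
    sample_html = (
        '<div><table><tr>'
        '<td>Subject:&nbsp;SO-2023-001</td>'
        '<td>Status: <b>Approved</b></td>'
        '</tr></table></div>'
    )
    result = extract_html_content(sample_html, '//td[2]')
    # Tags are stripped and whitespace collapsed, so this prints
    # something close to: Status: Approved
    print(result["cleaned_text"])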

def extract_crmid_from_search_result(html_content):
    """Extract CRM IDs from the unified-search result page (corrected version)."""
    crmids = []
    try:
        # Parse the HTML
        tree = html.fromstring(html_content)

        # First locate the SalesOrder module's div
        sales_order_div = tree.xpath('//div[@class="collapse in" and @id="collapse-SalesOrder"]')
        if not sales_order_div:
            print("  ⚠️ No SalesOrder results found on the search page")
            # Save the HTML for debugging
            with open("debug_no_salesorder.html", "w", encoding="utf-8") as f:
                f.write(html.tostring(tree, encoding='unicode', pretty_print=True))
            return crmids

        print("  ✅ Found the SalesOrder module")

        # Method 1: walk the fixed XPath pattern under that div.
        # Base XPath: /html/body/div[1]/div/div[2]/div/div/div[5]/div/div/div/div/div[3]/div[n]/div[2]/a
        # Each result row is a div[n] whose index starts at 2 and increases,
        # so we probe the indices one by one.
        base_path = "/html/body/div[1]/div/div[2]/div/div/div[5]/div/div/div/div/div[3]"
        n = 2
        while True:
            xpath_pattern = f"{base_path}/div[{n}]/div[2]/a"
            elements = tree.xpath(xpath_pattern)
            if not elements:
                # No link at this index: try the next one, but give up once
                # the index passes the cap of 20.
                if n > 20:
                    break
                n += 1
                continue
            # Links found: pull the record value out of each onclick handler
            for element in elements:
                onclick_attr = element.get('onclick', '')
                if onclick_attr:
                    match = re.search(r"record=(\d+)", onclick_attr)
                    if match:
                        crmid = match.group(1)
                        if crmid not in crmids:
                            crmids.append(crmid)
                            print(f"  Found CRM ID {crmid} at XPath {xpath_pattern}")
            n += 1

        # Method 2 (fallback): find every link whose onclick mentions module=SalesOrder
        if not crmids:
            print("  Trying the fallback method to find CRM IDs...")
            salesorder_links = tree.xpath('//a[contains(@onclick, "module=SalesOrder")]')
            for link in salesorder_links:
                onclick_attr = link.get('onclick', '')
                if onclick_attr:
                    match = re.search(r"record=(\d+)", onclick_attr)
                    if match:
                        crmid = match.group(1)
                        if crmid not in crmids:
                            crmids.append(crmid)
            print(f"  Fallback method found {len(crmids)} CRM ID(s)")

        # Deduplicate (preserving order) and return
        unique_crmids = list(dict.fromkeys(crmids))
        print(f"  {len(unique_crmids)} unique CRM ID(s) after deduplication")
        return unique_crmids
    except Exception as e:
        print(f"  ❌ Failed to parse search results: {e}")
        traceback.print_exc()
        return crmids


def perform_search(session, query_string):
    """Run a unified search and return the list of CRM IDs found."""
    try:
        # Build the search URL (query_string must already be percent-encoded)
        search_url = (
            f"{base_url}?module=Home&action=UnifiedSearch"
            f"&selectedmodule=undefined&query_string={query_string}"
        )
        print(f"  Searching for: {query_string}")
        print(f"  Search URL: {search_url}")

        # Fetch the search result page
        search_response = session.get(search_url, headers=headers)
        if search_response.status_code != 200:
            print(f"  ❌ Search request failed: HTTP {search_response.status_code}")
            return []

        # Save the result page for debugging
        with open("debug_search_result.html", "w", encoding="utf-8") as f:
            f.write(search_response.text)
        print("  Search results saved to debug_search_result.html")

        # Extract the CRM IDs
        return extract_crmid_from_search_result(search_response.content)
    except Exception as e:
        print(f"  ❌ Search failed: {e}")
        return []
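
# --- Design note (sketch, unused by main()): perform_search() expects the
# caller to percent-encode the query first, which main() does with
# urllib.parse.quote. An alternative is to let urlencode build the whole query
# string so callers can pass raw, un-encoded text. The parameters below are the
# same ones the original search URL carries; the helper name is hypothetical.
def _build_search_url(raw_query):
    params = {
        "module": "Home",
        "action": "UnifiedSearch",
        "selectedmodule": "undefined",
        "query_string": raw_query,  # raw user input; urlencode escapes it
    }
    return f"{base_url}?{urllib.parse.urlencode(params)}"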

# ================= 3. Main program logic =================
def main():
    session = requests.Session()

    # XPath of the detail-page block to extract
    target_xpath = "/html/body/div[1]/div/div[2]/div[2]/form/div[1]/div[1]/div[2]"

    try:
        # --- Step 1: log in ---
        print("1. Logging in...")
        session.post(base_url, data=login_payload, headers=headers)

        # Check whether we received a session cookie
        if 'PHPSESSID' not in session.cookies:
            print("⚠️ Warning: no PHPSESSID detected; login may have failed and later steps may error out.")
        else:
            print("  ✅ Login succeeded, cookie acquired.")

        # --- Step 2: read the search input ---
        print("\n2. Enter search text:")
        query_string = input("  Search keyword: ").strip()
        if not query_string:
            print("  ❌ No search text entered; exiting.")
            return

        # Percent-encode the query string
        encoded_query = urllib.parse.quote(query_string)

        # --- Step 3: run the search and extract CRM IDs ---
        print("\n3. Running the search and extracting CRM IDs...")
        crmids = perform_search(session, encoded_query)
        if not crmids:
            print("  ❌ No CRM IDs found; exiting.")
            return
        print(f"  ✅ Extracted {len(crmids)} CRM ID(s): {crmids}")

        # --- Step 4: fetch details for each CRM ID ---
        print("\n4. Fetching order details one by one...")
        success_count = 0
        orders_data = []

        for index, crmid in enumerate(crmids):
            print(f"\n  [{index + 1}/{len(crmids)}] Processing CRM ID: {crmid}")

            # 1. Fetch the JSON details (product table)
            detail_payload = {
                "module": "Plugins",
                "pluginName": "DetailProductTable",
                "action": "getTableData",
                "moduleName": "SalesOrder",
                "record": crmid,
                "actionId": get_current_action_id(),
                "isTool": "1"
            }
            try:
                detail_resp = session.post(base_url, data=detail_payload, headers=headers)
                json_detail = detail_resp.json()
                print("  ✅ JSON details fetched")
            except Exception as e:
                print(f"  ❌ Failed to fetch JSON details: {e}")
                json_detail = {"error": str(e)}

            # 2. Fetch the HTML detail page and extract the target XPath
            print("  Fetching HTML details...")
            html_content = fetch_html_detail(session, crmid, target_xpath)

            # 3. Assemble the order record
            order_data = {
                "crmid": crmid,
                "json_details": json_detail,
                "html_details": html_content,
                "combined_data": {
                    "crmid": crmid,
                    "json_data": json_detail,
                    "html_extracted_text": html_content.get("cleaned_text", ""),
                    "html_raw": html_content.get("raw_html", "")
                }
            }
            orders_data.append(order_data)
            success_count += 1
            print(f"  ✅ CRM ID {crmid} done")

            # Courtesy delay to avoid hammering the server
            time.sleep(0.5)

        # --- Step 5: save the results ---
        print("\n5. Saving results...")

        # Create the output directory
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = f"crm_data_search_{timestamp}"
        os.makedirs(output_dir, exist_ok=True)

        # Save the search metadata
        search_info = {
            "query_string": query_string,
            "encoded_query": encoded_query,
            "search_time": time.strftime("%Y-%m-%d %H:%M:%S"),
            "crmids_found": crmids,
            "total_count": len(crmids)
        }
        with open(os.path.join(output_dir, "search_info.json"), 'w', encoding='utf-8') as f:
            json.dump(search_info, f, ensure_ascii=False, indent=4)

        # Save the full combined data
        full_filename = os.path.join(output_dir, "all_orders_combined.json")
        with open(full_filename, 'w', encoding='utf-8') as f:
            json.dump(orders_data, f, ensure_ascii=False, indent=4)

        # Also save one file per CRM ID
        print("  Saving one file per CRM ID...")
        for order in orders_data:
            record_id = order.get('crmid')
            if record_id:
                single_filename = os.path.join(output_dir, f"crm_{record_id}.json")
                with open(single_filename, 'w', encoding='utf-8') as f:
                    json.dump(order, f, ensure_ascii=False, indent=4)

        # Save the extracted texts as a plain-text file for easy review
        text_filename = os.path.join(output_dir, "extracted_texts.txt")
        with open(text_filename, 'w', encoding='utf-8') as f:
            f.write(f"=== Search query: {query_string} ===\n")
            f.write(f"=== Extracted at: {time.strftime('%Y-%m-%d %H:%M:%S')} ===\n")
            f.write(f"=== {len(crmids)} result(s) found ===\n\n")
            for order in orders_data:
                record_id = order.get('crmid')
                if record_id:
                    extracted_text = order.get('html_details', {}).get('cleaned_text', '')
                    if extracted_text:
                        f.write(f"\n--- CRM ID: {record_id} ---\n")
                        f.write(f"{extracted_text}\n")
                        f.write("-" * 50 + "\n")

        # Write a CSV summary
        csv_filename = os.path.join(output_dir, "summary.csv")
        with open(csv_filename, 'w', encoding='utf-8') as f:
            f.write("CRM ID,Extracted Text Length,JSON Status\n")
            for order in orders_data:
                record_id = order.get('crmid')
                text_length = len(order.get('html_details', {}).get('cleaned_text', ''))
                # Missing data or our own {"error": ...} marker counts as a failure
                detail = order.get('json_details')
                json_status = "ok" if detail and not (isinstance(detail, dict) and detail.get('error')) else "failed"
                f.write(f"{record_id},{text_length},{json_status}\n")

        print("\n✅ All done!")
        print(f"  Processed: {success_count}/{len(crmids)} CRM ID(s)")
        print(f"  Output directory: {os.path.abspath(output_dir)}")
        print("  Main files:")
        print("    - search_info.json (search metadata)")
        print("    - all_orders_combined.json (all data)")
        print("    - extracted_texts.txt (extracted texts)")
        print("    - summary.csv (data summary)")
        print(f"    - {success_count} per-CRM-ID JSON file(s)")

        # Preview the extracted texts
        print("\n=== Extracted text preview ===")
        for order in orders_data[:3]:  # show at most the first three
            record_id = order.get('crmid')
            extracted_text = order.get('html_details', {}).get('cleaned_text', '')
            preview = extracted_text[:100] + "..." if len(extracted_text) > 100 else extracted_text
            print(f"CRM ID {record_id}: {preview}")
        if len(orders_data) > 3:
            print(f"... {len(orders_data) - 3} more not shown")

    except Exception as e:
        print(f"\n❌ Unhandled error: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()
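
# Example run (the file name and the transcript below are illustrative only;
# actual output depends on the server's data):
#
#   $ python crm_search.py
#   1. Logging in...
#     ✅ Login succeeded, cookie acquired.
#   2. Enter search text:
#     Search keyword: <your query>
#   ...
#
# Results are written to a timestamped directory such as
# crm_data_search_20240101_120000/ containing search_info.json,
# all_orders_combined.json, extracted_texts.txt, summary.csv, and one
# crm_<record>.json file per CRM ID found.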