🛠️shopify采集脚本 - https://www.poizon.com/search?keyword=New+Balance+327&track_referer_source=m1

🛰️ https://www.poizon.com/search?keyword=New+Balance+327&track_referer_source=m1

poizon_scraper.py获取分类数据

import requests
from bs4 import BeautifulSoup
import time
import logging
from datetime import datetime

# 配置日志
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('scraper.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)

def get_page_urls(page_number):
    base_url = "https://www.poizon.com/search"
    params = {
        "keyword": "New Balance 327",
        "track_referer_source": "m1",
        "page": page_number
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Referer": "https://www.poizon.com/"
    }

    try:
        logging.info(f"开始获取第 {page_number} 页数据")
        response = requests.get(base_url, params=params, headers=headers)
        response.raise_for_status()

        # 保存网页内容用于调试
        with open(f'page_{page_number}.html', 'w', encoding='utf-8') as f:
            f.write(response.text)

        soup = BeautifulSoup(response.text, 'html.parser')

        # 尝试多种选择器
        selectors = [
            'div[class*="product"] a[href*="/product/"]',
            'a[href*="/product/"]',
            'div.section div div div div:nth-child(2) div:nth-child(2) div:nth-child(2) a',
            'div[class*="item"] a[href*="/product/"]'
        ]

        urls = []
        for selector in selectors:
            links = soup.select(selector)
            if links:
                logging.info(f"使用选择器 '{selector}' 找到 {len(links)} 个链接")
                for link in links:
                    href = link.get('href')
                    if href and '/product/' in href:
                        full_url = f"@https://www.poizon.com{href}"
                        if full_url not in urls:  # 避免重复
                            urls.append(full_url)
                if urls:
                    break  # 如果找到链接就停止尝试其他选择器

        if not urls:
            logging.warning(f"第 {page_number} 页未找到任何链接，请检查网页结构")
            # 打印一些调试信息
            logging.debug(f"页面标题: {soup.title.string if soup.title else 'No title'}")
            logging.debug(f"页面内容长度: {len(response.text)}")

        logging.info(f"第 {page_number} 页成功获取 {len(urls)} 个链接")
        return urls
    except Exception as e:
        logging.error(f"获取第 {page_number} 页数据时出错: {str(e)}")
        return []

def main():
    start_time = datetime.now()
    logging.info("开始爬取任务")

    all_urls = []
    for page in range(1, 14):  # Pages 1-13
        urls = get_page_urls(page)
        all_urls.extend(urls)
        time.sleep(2)  # 添加延迟避免请求过快

    # 保存结果到文件
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f'poizon_urls_{timestamp}.txt'

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(f"爬取时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"总链接数: {len(all_urls)}\n")
        f.write("-" * 50 + "\n")
        for url in all_urls:
            f.write(f"{url}\n")

    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()

    logging.info(f"爬取完成，共获取 {len(all_urls)} 个链接")
    logging.info(f"结果已保存到文件: {output_file}")
    logging.info(f"总耗时: {duration:.2f} 秒")

if __name__ == "__main__":
    main()

poizon.py

import requests
import json
import time
import csv
from datetime import datetime
import threading
from queue import Queue
import os
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import random
import shutil
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
import socks
import socket

def get_product_id_from_url(url):
    """从URL中提取产品ID"""
    # 提取URL最后的数字作为产品ID
    match = re.search(r'-(\d+)$', url)
    if match:
        product_id = match.group(1)
        print(f"找到产品ID: {product_id}")
        return product_id

    # 如果上面的模式匹配失败，尝试匹配最后一个连字符后的数字
    match = re.search(r'-(\d+)(?:/|$)', url)
    if match:
        product_id = match.group(1)
        print(f"找到产品ID: {product_id}")
        return product_id

    print(f"无法从URL提取产品ID: {url}")
    return None

def get_product_data(product_id, max_retries=3, retry_delay=2):
    """使用API获取产品数据"""
    url = "https://app.poizon.com/api/v1/h5/product/detail"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/json',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Content-Type': 'application/json',
        'Origin': 'https://www.poizon.com',
        'Referer': 'https://www.poizon.com/',
        'X-Auth-Token': '',
        'Access-Control-Allow-Origin': '*'
    }

    data = {
        "productId": product_id,
        "productSourceName": "default"
    }

    for attempt in range(max_retries):
        try:
            response = requests.post(url, headers=headers, json=data)
            response.raise_for_status()
            result = response.json()

            if result.get('code') == '401' and result.get('msg') == '操作频繁':
                print(f"遇到频率限制，等待重试 (尝试 {attempt + 1}/{max_retries})")
                time.sleep(retry_delay * (attempt + 1))  # 指数退避
                continue

            return result

        except Exception as e:
            print(f"获取产品数据失败 (ID: {product_id}, 尝试 {attempt + 1}/{max_retries}): {str(e)}")
            print(f"响应内容: {response.text if 'response' in locals() else '无响应'}")
            if attempt < max_retries - 1:
                time.sleep(retry_delay * (attempt + 1))

    return None

def convert_to_shopify_format(product_data):
    """将产品数据转换为Shopify格式"""
    try:
        # 获取基本商品信息
        detail = product_data.get('detail', {})
        if not detail:
            print("警告: 未找到detail信息")
            return None

        # 检查标题是否为空
        title = detail.get('title', '').strip()
        if not title:
            print("警告: 商品标题为空，跳过")
            return None

        print(f"\n处理产品: {title}")

        # 获取品牌信息
        brand_info = product_data.get('brandItemsModel', {})
        brand_name = brand_info.get('brandName', 'POIZON')

        # 固定的商品描述
        description = """
<div>
<p>1970s, but make it modern. Inspired by our most iconic '70s running shoes, the 327 is a bold blend of retro and right now unlike anything in your closet.</p>

<h3>Product Details</h3>
<h4>Features</h4>
<ul>
<li>Suede and nylon upper with animal-print accents</li>
<li>Oversized N logo adds retro-inspired style</li>
<li>Lace-up closure for a secure fit</li>
</ul>
</div>
"""

        # 获取商品图片
        images = []
        # 从imageModels获取主图
        image_models = product_data.get('imageModels', [])
        for img in image_models:
            if img.get('url'):
                images.append({
                    "src": img['url'],
                    "position": img.get('sort', len(images) + 1),
                    "alt": title
                })

        # 从detailImageList获取详情图
        detail_images = product_data.get('detailImageList', [])
        for img in detail_images:
            if img.get('url') and img.get('imgType') != 10002:  # 跳过特殊类型图片
                images.append({
                    "src": img['url'],
                    "position": img.get('sort', len(images) + 1),
                    "alt": title
                })

        # 固定的尺码范围
        fixed_sizes = [
            "4", "4.5", "5", "5.5", "6", "6.5", "7", "7.5", "8", "8.5", "9",
            "9.5", "10", "10.5", "11", "11.5", "12", "12.5", "13", "13.5",
            "14", "14.5", "15"
        ]

        # 创建基础商品信息
        shopify_product = {
            "title": title,
            "body_html": description,
            "vendor": brand_name,
            "product_type": detail.get("frontCategoryName", "").title(),
            "created_at": datetime.now().isoformat(),
            "handle": str(detail.get("spuId", "")),
            "published_at": datetime.now().isoformat(),
            "template_suffix": "",
            "published_scope": "web",
            "tags": "",
            "variants": [],
            "options": [{"name": "Size", "position": 1, "values": fixed_sizes}],
            "images": images,
            "status": "active"
        }

        # 为每个尺码创建变体
        for size in fixed_sizes:
            variant = {
                "sku": f"{detail.get('spuId')}_{size}",
                "price": 29.99,  # 固定售价
                "compare_at_price": 160.00,  # 固定原价
                "option1": size,
                "inventory_quantity": 999,  # 默认库存
                "inventory_policy": "deny",
                "inventory_management": "shopify",
                "requires_shipping": True,
                "taxable": True,
                "barcode": "",
                "weight": 0,
                "weight_unit": "kg",
                "fulfillment_service": "manual"
            }
            shopify_product["variants"].append(variant)

        # 添加其他元数据
        shopify_product.update({
            "SEO Title": title,
            "SEO Description": description,
            "Google Shopping / Google Product Category": product_data.get("googleProductCategory", ""),
            "Google Shopping / Gender": detail.get("gender", "unisex"),
            "Google Shopping / Age Group": detail.get("ageRange", "adult"),
            "Google Shopping / MPN": detail.get("spuId", ""),
            "Google Shopping / Condition": "New"
        })

        return shopify_product

    except Exception as e:
        print(f"转换数据时出错: {str(e)}")
        return None

def write_products_to_csv(products, writer, url):
    """将产品数据写入CSV文件"""
    for product in products:
        if not product:  # 跳过无效产品
            continue

        base_row = {
            'Handle': product['handle'],
            'Title': product['title'],
            'Body (HTML)': product['body_html'],
            'Vendor': product['vendor'],
            'Type': product['product_type'],
            'Tags': '',
            'Published': 'TRUE',
            'Option1 Name': 'Size',
            'Option2 Name': '',
            'Option3 Name': '',
            'Variant Grams': '0',
            'Variant Inventory Tracker': 'shopify',
            'Variant Inventory Policy': 'deny',
            'Variant Fulfillment Service': 'manual',
            'Variant Requires Shipping': 'TRUE',
            'Variant Taxable': 'TRUE',
            'Variant Barcode': '',
            'Image Src': '',
            'Image Position': '',
            'Image Alt Text': '',
            'Gift Card': 'FALSE',
            'SEO Title': product['title'],
            'SEO Description': '',
            'Google Shopping / Google Product Category': '',
            'Google Shopping / Gender': '',
            'Google Shopping / Age Group': '',
            'Google Shopping / MPN': '',
            'Google Shopping / AdWords Grouping': '',
            'Google Shopping / AdWords Labels': '',
            'Google Shopping / Condition': '',
            'Google Shopping / Custom Product': '',
            'Google Shopping / Custom Label 0': '',
            'Google Shopping / Custom Label 1': '',
            'Google Shopping / Custom Label 2': '',
            'Google Shopping / Custom Label 3': '',
            'Google Shopping / Custom Label 4': '',
            'Variant Weight Unit': 'g',
            'Variant Tax Code': '',
            'Cost per item': '',
            'Status': 'active',
            'Source URL': url
        }

        # 首先写入带有变体信息的行
        for variant in product['variants']:
            row = base_row.copy()
            row.update({
                'Option1 Value': variant['option1'],
                'Variant SKU': variant['sku'],
                'Variant Price': variant['price'],
                'Variant Compare At Price': variant.get('compare_at_price', ''),
                'Variant Inventory Qty': variant['inventory_quantity'],
            })

            # 如果有图片，写入第一张图片信息
            if product['images']:
                first_image = product['images'][0]
                row.update({
                    'Image Src': first_image['src'],
                    'Image Position': '1',
                    'Variant Image': first_image['src']
                })

            writer.writerow(row)

        # 然后写入额外的图片行（从第二张图片开始）
        if len(product['images']) > 1:
            for image in product['images'][1:]:
                image_row = {
                    'Handle': product['handle'],
                    'Title': '',
                    'Body (HTML)': '',
'Vendor': '',
                    'Type': '',
                    'Tags': '',
                    'Published': '',
                    'Option1 Name': '',
                    'Option1 Value': '',
                    'Option2 Name': '',
                    'Option2 Value': '',
                    'Option3 Name': '',
                    'Option3 Value': '',
                    'Variant SKU': '',
                    'Variant Grams': '',
                    'Variant Inventory Tracker': '',
                    'Variant Inventory Qty': '',
                    'Variant Inventory Policy': '',
                    'Variant Fulfillment Service': '',
                    'Variant Price': '',
                    'Variant Compare At Price': '',
                    'Variant Requires Shipping': '',
                    'Variant Taxable': '',
                    'Variant Barcode': '',
                    'Image Src': image['src'],
                    'Image Position': str(image['position']),
                    'Image Alt Text': '',
                    'Gift Card': '',
                    'SEO Title': '',
                    'SEO Description': '',
                    'Google Shopping / Google Product Category': '',
                    'Google Shopping / Gender': '',
                    'Google Shopping / Age Group': '',
                    'Google Shopping / MPN': '',
                    'Google Shopping / AdWords Grouping': '',
                    'Google Shopping / AdWords Labels': '',
                    'Google Shopping / Condition': '',
                    'Google Shopping / Custom Product': '',
                    'Google Shopping / Custom Label 0': '',
                    'Google Shopping / Custom Label 1': '',
                    'Google Shopping / Custom Label 2': '',
                    'Google Shopping / Custom Label 3': '',
                    'Google Shopping / Custom Label 4': '',
                    'Variant Image': '',
                    'Variant Weight Unit': '',
                    'Variant Tax Code': '',
                    'Cost per item': '',
                    'Status': '',
                    'Source URL': url
                }
                writer.writerow(image_row)

def get_urls_from_file(filename):
    """从文件中读取URL列表"""
    urls = []
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('@https://'):
                    url = line[1:]  # 去掉@符号
                    urls.append(url)
                    print(f"读取URL: {url}")
    except Exception as e:
        print(f"读取URL文件出错: {str(e)}")

    print(f"共读取到 {len(urls)} 个URL")
    return urls

def initialize_drivers(num_threads):
    """初始化WebDriver池"""
    drivers = []

    # 获取msedgedriver.exe的完整路径
    driver_path = os.path.abspath("msedgedriver.exe")
    print(f"当前工作目录: {os.getcwd()}")
    print(f"msedgedriver.exe 完整路径: {driver_path}")

    # 检查文件是否存在
    if not os.path.exists(driver_path):
        raise Exception(f"找不到msedgedriver.exe: {driver_path}")
    else:
        print("已找到 msedgedriver.exe")

    for i in range(num_threads):
        try:
            print(f"\n创建WebDriver实例 {i + 1}/{num_threads}")

            # 创建Options实例
            options = webdriver.EdgeOptions()

            # 基本配置
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            options.add_argument('--disable-gpu')
            options.add_argument('--disable-extensions')
            options.add_argument('--disable-popup-blocking')
            options.add_argument('--disable-blink-features=AutomationControlled')
            options.add_argument('--disable-infobars')
            options.add_argument('--disable-notifications')
            options.add_argument('--ignore-certificate-errors')
            options.add_argument('--ignore-ssl-errors')
            options.add_argument('--no-first-run')
            options.add_argument('--no-default-browser-check')
            options.add_argument('--window-size=1920,1080')
            options.add_argument('--lang=zh-CN')

            # 禁用日志和自动化提示
            options.add_experimental_option('excludeSwitches', ['enable-logging', 'enable-automation'])
            options.add_experimental_option('useAutomationExtension', False)

            # 设置其他首选项
            prefs = {
                'profile.default_content_setting_values': {
                    'notifications': 2,
                    'images': 1,
                    'javascript': 1,
                    'cookies': 1,
                    'plugins': 1,
                    'popups': 2
                }
            }
            options.add_experimental_option('prefs', prefs)

            # 创建Service实例
            service = Service(executable_path=driver_path)

            # 创建WebDriver实例
            driver = webdriver.Edge(service=service, options=options)

            # 设置页面加载超时
            driver.set_page_load_timeout(30)
            driver.set_script_timeout(30)

            drivers.append(driver)
            print(f"成功创建WebDriver实例 {i + 1}")

        except Exception as e:
            print(f"创建WebDriver实例 {i + 1} 失败: {str(e)}")
            # 尝试清理失败的实例
            try:
                if 'driver' in locals():
                    driver.quit()
            except:
                pass

    if not drivers:
        raise Exception("无法创建任何WebDriver实例")

    return drivers

def cleanup_drivers(drivers):
    """清理WebDriver实例"""
    for driver in drivers:
        try:
            driver.quit()
        except:
            pass

def get_safe_filename(url):
    """从URL生成安全的文件名"""
    # 获取URL的最后一部分
    base_name = url.split('/')[-1]
    # 替换非单词字符为下划线
    safe_name = re.sub(r'[^\w]', '_', base_name)
    return safe_name

def save_debug_info(url, data, prefix='debug'):
    """保存调试信息到文件"""
    safe_name = get_safe_filename(url)
    filename = f"{prefix}_{safe_name}.json"

    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"调试信息已保存到: {filename}")
    except Exception as e:
        print(f"保存调试信息失败: {str(e)}")

def get_poizon_products():
    num_threads = 3  # 使用3个线程
    batch_size = 3   # 每批处理3个URL

    # 使用固定的CSV文件名
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = f'poizon_products_{timestamp}.csv'
    fieldnames = [
        "Handle", "Title", "Body (HTML)", "Vendor", "Type", "Tags", "Published",
        "Option1 Name", "Option1 Value", "Option2 Name", "Option2 Value",
        "Option3 Name", "Option3 Value", "Variant SKU", "Variant Grams",
        "Variant Inventory Tracker", "Variant Inventory Qty", "Variant Inventory Policy",
        "Variant Fulfillment Service", "Variant Price", "Variant Compare At Price",
        "Variant Requires Shipping", "Variant Taxable", "Variant Barcode",
        "Image Src", "Image Position", "Image Alt Text", "Gift Card",
        "SEO Title", "SEO Description", "Google Shopping / Google Product Category",
        "Google Shopping / Gender", "Google Shopping / Age Group",
        "Google Shopping / MPN", "Google Shopping / AdWords Grouping",
        "Google Shopping / AdWords Labels", "Google Shopping / Condition",
        "Google Shopping / Custom Product", "Google Shopping / Custom Label 0",
        "Google Shopping / Custom Label 1", "Google Shopping / Custom Label 2",
        "Google Shopping / Custom Label 3", "Google Shopping / Custom Label 4",
        "Variant Image", "Variant Weight Unit", "Variant Tax Code",
        "Cost per item", "Status", "Source URL"
    ]

    print("初始化WebDriver...")
    # 初始化WebDriver池
    drivers = initialize_drivers(num_threads)
    driver_queue = Queue()
    for driver in drivers:
        driver_queue.put(driver)

    try:
        print(f"\n开始读取URL文件...")
        urls = get_urls_from_file('poizon_urls_20250422_120341.txt')
        print(f"总共需要处理 {len(urls)} 个URL")

        # 检查CSV文件是否存在
        file_exists = os.path.exists(csv_filename)

        # 打开CSV文件，使用追加模式，并设置为行缓冲
        csvfile = open(csv_filename, 'a' if file_exists else 'w', newline='', encoding='utf-8-sig', buffering=1)
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # 如果文件不存在，写入表头
        if not file_exists:
            writer.writeheader()
            csvfile.flush()  # 立即写入表头
            print(f"创建新的CSV文件: {csv_filename}")
        else:
            print(f"追加到现有CSV文件: {csv_filename}")

        # 创建线程锁
        csv_lock = threading.Lock()

        # 创建线程池
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # 提交所有任务
            futures = []
            for url in urls:
                future = executor.submit(process_url, url, driver_queue, writer, csv_lock, csvfile)
                futures.append(future)

            # 等待所有任务完成并显示进度
            completed = 0
            total = len(urls)
            for future in as_completed(futures):
                completed += 1
                try:
                    future.result()  # 获取任务结果（如果有异常会在这里抛出）
                except Exception as e:
                    print(f"任务执行失败: {str(e)}")
                print(f"进度: {completed}/{total} ({(completed/total*100):.1f}%)")

                # 每完成一批就刷新文件
                if completed % batch_size == 0:
                    csvfile.flush()

    except Exception as error:
        print(f"发生错误: {str(error)}")
    finally:
        # 清理资源
        print("\n清理资源...")
        if 'csvfile' in locals():
            csvfile.close()
        cleanup_drivers(drivers)
        print(f"数据已保存到 {csv_filename}")

def wait_for_next_data(driver, timeout=30):
    """等待__NEXT_DATA__加载完成"""
    start_time = time.time()
    while time.time() - start_time < timeout:
        try:
            next_data = driver.execute_script("return window.__NEXT_DATA__;")
            if next_data:
                return next_data
        except:
            pass
        time.sleep(1)
return None

def process_url(url, driver_queue, csv_writer, csv_lock, csvfile):
    """处理单个URL的函数"""
    driver = None
    try:
        driver = driver_queue.get()
        if not driver:  # 如果获取到的是None，创建新的driver
            print(f"\n创建新的WebDriver实例")
            options = webdriver.EdgeOptions()

            # 基本配置
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            options.add_argument('--disable-gpu')
            options.add_argument('--disable-extensions')
            options.add_argument('--disable-popup-blocking')
            options.add_argument('--disable-blink-features=AutomationControlled')
            options.add_argument('--disable-infobars')
            options.add_argument('--disable-notifications')
            options.add_argument('--ignore-certificate-errors')
            options.add_argument('--ignore-ssl-errors')
            options.add_argument('--no-first-run')
            options.add_argument('--no-default-browser-check')
            options.add_argument('--window-size=1920,1080')
            options.add_argument('--lang=zh-CN')

            # 禁用日志和自动化提示
            options.add_experimental_option('excludeSwitches', ['enable-logging', 'enable-automation'])
            options.add_experimental_option('useAutomationExtension', False)

            # 设置其他首选项
            prefs = {
                'profile.default_content_setting_values': {
                    'notifications': 2,
                    'images': 1,
                    'javascript': 1,
                    'cookies': 1,
                    'plugins': 1,
                    'popups': 2
                }
            }
            options.add_experimental_option('prefs', prefs)

            # 创建新的driver实例
            service = Service(executable_path=os.path.abspath("msedgedriver.exe"))
            driver = webdriver.Edge(service=service, options=options)
            driver.set_page_load_timeout(30)
            driver.set_script_timeout(30)

        print(f"\n正在处理URL: {url}")

        # 添加随机延时
        time.sleep(random.uniform(1, 3))

        print(f"正在打开页面...")
        driver.get(url)

        # 等待页面加载完成
        wait = WebDriverWait(driver, 30)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        # 等待页面完全加载
        time.sleep(5)  # 给页面一些额外的加载时间

        # 等待__NEXT_DATA__加载
        print("等待数据加载...")
        next_data = wait_for_next_data(driver)

        if next_data:
            print("成功获取 __NEXT_DATA__ 对象")
            try:
                # 如果返回的是字符串，则解析为JSON
                if isinstance(next_data, str):
                    next_data = json.loads(next_data)

                # 获取商品详情数据
                goods_detail = next_data.get('props', {}).get('pageProps', {}).get('goodsDetail')

                if goods_detail:
                    print("成功获取商品详情数据")
                    shopify_product = convert_to_shopify_format(goods_detail)
                    if shopify_product:
                        # 立即写入CSV文件
                        with csv_lock:
                            write_products_to_csv([shopify_product], csv_writer, url)
                            csvfile.flush()  # 使用传入的csvfile对象进行刷新
                        print(f"成功处理并保存产品: {url}")
                        # 成功获取数据，将driver放回队列
                        driver_queue.put(driver)
                        return
                    else:
                        print(f"无法转换产品数据: {url}")
                else:
                    print(f"未找到商品详情数据: {url}")
            except Exception as e:
                print(f"解析数据时出错: {str(e)}")
        else:
            print("尝试从页面源码中提取数据...")
            try:
                # 刷新页面重试一次
                driver.refresh()
                time.sleep(5)  # 等待刷新后的页面加载

                # 获取页面源码
                page_source = driver.page_source

                # 查找 __NEXT_DATA__ 脚本标签
                match = re.search(r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', page_source)
                if match:
                    # 提取JSON字符串
                    json_str = match.group(1)
                    # 解析JSON数据
                    json_data = json.loads(json_str)

                    # 获取商品详情数据
                    goods_detail = json_data.get('props', {}).get('pageProps', {}).get('goodsDetail')

                    if goods_detail:
                        print("从页面源码成功获取商品详情数据")
                        shopify_product = convert_to_shopify_format(goods_detail)
                        if shopify_product:
                            # 立即写入CSV文件
                            with csv_lock:
                                write_products_to_csv([shopify_product], csv_writer, url)
                                csvfile.flush()  # 使用传入的csvfile对象进行刷新
                            print(f"成功处理并保存产品: {url}")
                            # 成功获取数据，将driver放回队列
                            driver_queue.put(driver)
                            return
                        else:
                            print(f"无法转换产品数据: {url}")
                    else:
                        print(f"未找到商品详情数据: {url}")
                else:
                    print(f"未找到 __NEXT_DATA__ 标签: {url}")
            except Exception as e:
                print(f"从页面源码提取数据时出错: {str(e)}")

            print(f"无法获取产品数据: {url}")

        # 如果到这里，说明获取数据失败
        driver_queue.put(None)

    except Exception as e:
        print(f"处理URL时出错 {url}: {str(e)}")
        if driver:
            try:
                driver.quit()
            except:
                pass
        driver_queue.put(None)

if __name__ == "__main__":
    print("开始运行爬虫程序...")
    get_poizon_products()

搜索结果

poizon_scraper.py获取分类数据

poizon.py