🛠️ Shopify 采集脚本 - stoneisland

💡 stoneisland

stoneisland-getpid

from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import time
import random
import threading
from queue import Queue
import os

# Output file; each newly discovered product ID is appended to it, one per line
output_file = "stoneisland-pid.txt"

# Base URL of the search endpoint that is queried page by page
base_url = "https://www.stoneisland.com/on/demandware.store/Sites-StoneEU-Site/it_IT/SearchApi-Search"

# Number of worker threads
NUM_THREADS = 10

# Queue holding the `start` offsets that still have to be processed
task_queue = Queue()

# Set of every product ID seen so far; guarded by product_ids_lock for thread safety
product_ids = set()
product_ids_lock = threading.Lock()

# Running count of unique products; guarded by total_products_lock for thread safety
total_products = 0
total_products_lock = threading.Lock()

# Lock serializing writes to the output file
file_lock = threading.Lock()

def worker(thread_id):
    """Scrape product IDs from the search pages whose offsets are in task_queue.

    Each worker owns one Edge browser instance. It repeatedly takes a `start`
    offset from the shared queue, loads the matching search page, parses the
    JSON rendered inside the page's <pre> element and appends every product ID
    not seen before to the output file. The worker exits once the queue is
    empty.

    Args:
        thread_id: Label used only in log messages.

    NOTE(review): a page that fails is put back on the queue with no retry
    cap, so a page that fails permanently is retried forever.
    """
    global total_products
    # Local import: the file only imports Queue from the queue module.
    from queue import Empty

    # Configure the Edge WebDriver
    service = Service('./msedgedriver.exe')
    options = webdriver.EdgeOptions()
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--no-sandbox')

    # Stays None if the browser fails to start, so cleanup can skip quit().
    driver = None
    try:
        # Start the browser
        driver = webdriver.Edge(service=service, options=options)
        wait = WebDriverWait(driver, 30)

        while True:
            try:
                # Take the next start offset from the queue
                start = task_queue.get_nowait()
            except Empty:
                # Only an empty queue ends the loop; other errors must surface.
                break

            try:
                # Build the page URL
                url = f"{base_url}?cgid=salesstoneisland-viewallsales&srule=Out%20of%20Stock_GO_Live_26.07&sz=20&start={start}"
                print(f"\n线程 {thread_id} 正在访问: {url}")

                driver.get(url)
                time.sleep(random.uniform(2, 4))

                try:
                    pre_element = wait.until(
                        EC.presence_of_element_located((By.TAG_NAME, "pre"))
                    )

                    # The browser renders the JSON response inside a <pre> element
                    data = json.loads(pre_element.text)

                    # Products are listed under data -> products
                    products = data.get("data", {}).get("products", [])

                    if not products:
                        print(f"线程 {thread_id}: 没有更多产品，结束获取 (start={start})")
                        task_queue.task_done()
                        continue

                    for product in products:
                        product_id = product.get("id")
                        if not product_id:
                            continue
                        # Locks are always taken in the same order:
                        # product_ids_lock, then file_lock / total_products_lock.
                        with product_ids_lock:
                            if product_id in product_ids:
                                continue
                            product_ids.add(product_id)
                            with file_lock:
                                with open(output_file, "a", encoding="utf-8") as file:
                                    file.write(f"{product_id}\n")
                            with total_products_lock:
                                total_products += 1
                                print(f"线程 {thread_id} 找到产品ID: {product_id}, 总计: {total_products}")

                    task_queue.task_done()

                except Exception as e:
                    print(f"线程 {thread_id} 等待页面加载超时或解析数据出错: {e}")
                    time.sleep(random.uniform(5, 8))
                    task_queue.put(start)  # put the failed offset back on the queue
                    task_queue.task_done()
                    continue

            except Exception as e:
                print(f"线程 {thread_id} 处理页面时出错: {e}")
                time.sleep(random.uniform(5, 8))
                task_queue.put(start)  # put the failed offset back on the queue
                task_queue.task_done()
                continue

    except Exception as e:
        print(f"线程 {thread_id} 运行过程中发生错误: {e}")

    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Best-effort shutdown: the browser may already be gone.
                pass

def main():
    """Collect product IDs with NUM_THREADS workers and report the total.

    Empties the output file, queues the page offsets 0, 20, ..., 980, runs the
    workers to completion and prints a summary.
    """
    # Opening in "w" mode is enough to truncate the output file
    with open(output_file, "w", encoding="utf-8"):
        pass

    # Queue one offset per page of 20 results (assumes at most 1000 products)
    for offset in range(0, 1000, 20):
        task_queue.put(offset)

    # Create the workers, numbered from 1, then start them all
    workers = [
        threading.Thread(target=worker, args=(number,))
        for number in range(1, NUM_THREADS + 1)
    ]
    for running in workers:
        running.start()

    # Block until every worker has finished
    for running in workers:
        running.join()

    print(f"\n抓取完成，共获取 {total_products} 个唯一产品ID")
    print(f"结果已保存到 {output_file}")

# Run the product-ID scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()

stoneisland

import json
import csv
import requests
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
import threading
import unicodedata
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

# Column names of the Shopify product CSV; used as the DictWriter field names
fields = [
    "Handle", "Title", "Body (HTML)", "Vendor", "Type", "Tags", "Published",
    "Option1 Name", "Option1 Value", "Option2 Name", "Option2 Value",
    "Option3 Name", "Option3 Value", "Variant SKU", "Variant Grams",
    "Variant Inventory Tracker", "Variant Inventory Qty", "Variant Inventory Policy",
    "Variant Fulfillment Service", "Variant Price", "Variant Compare At Price",
    "Variant Requires Shipping", "Variant Taxable", "Variant Barcode",
    "Image Src", "Image Position", "Image Alt Text", "Gift Card",
    "SEO Title", "SEO Description", "Google Shopping / Google Product Category",
    "Google Shopping / Gender", "Google Shopping / Age Group",
    "Google Shopping / MPN", "Google Shopping / AdWords Grouping",
    "Google Shopping / AdWords Labels", "Google Shopping / Condition",
    "Google Shopping / Custom Product", "Google Shopping / Custom Label 0",
    "Google Shopping / Custom Label 1", "Google Shopping / Custom Label 2",
    "Google Shopping / Custom Label 3", "Google Shopping / Custom Label 4",
    "Variant Image", "Variant Weight Unit", "Variant Tax Code",
    "Cost per item", "Status"
]

# Queue feeding the CSV writer thread, plus the progress counters.
# processed_count is updated under counter_lock; total_urls is set once in main().
write_queue = Queue()
counter_lock = threading.Lock()
processed_count = 0
total_urls = 0

def normalize_text(text):
    """Reduce *text* to plain ASCII and trim surrounding whitespace.

    Accented letters lose their diacritics (e.g. "à" becomes "a"); characters
    with no ASCII form after NFKD decomposition are dropped. Any falsy input
    (None, "") yields "".
    """
    if not text:
        return ""

    # NFKD splits an accented letter into its base letter plus combining marks.
    decomposed = unicodedata.normalize('NFKD', text)

    # Dropping everything outside ASCII removes the combining marks along with
    # any remaining special characters, leaving only the base letters.
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')

    return ascii_only.strip()

def _extract_json_text(driver, wait):
    """Return the raw JSON text displayed by the browser, or "" if none is found.

    The <pre> element the browser renders JSON into is preferred. If it cannot
    be located, the page source is searched for a <pre>...</pre> section; if
    that is missing too, the whole page source is returned.
    """
    import html
    import re

    try:
        # Wait for the <pre> element to appear
        pre_element = wait.until(EC.presence_of_element_located((By.TAG_NAME, "pre")))
        json_text = pre_element.text
        print("Successfully found pre element with JSON data")
    except Exception as e:
        print(f"Failed to find pre element: {str(e)}")
        # Fall back to the full page source and cut the JSON part out of it
        json_text = driver.page_source
        match = re.search(r'<pre.*?>(.*?)</pre>', json_text, re.DOTALL)
        if match:
            # page_source is serialized HTML, so characters such as & and <
            # arrive as entities and must be decoded before JSON parsing.
            json_text = html.unescape(match.group(1))
            print("Extracted JSON from page source")

    # Clean up the JSON text
    json_text = (json_text or "").strip()
    if json_text.startswith('<pre>'):
        json_text = json_text[5:]
    if json_text.endswith('</pre>'):
        json_text = json_text[:-6]
    return json_text


def _find_payload_problems(data):
    """Return log messages explaining why *data* is unusable; empty list if it is fine."""
    if not data:
        return ["Empty JSON data"]

    if not isinstance(data, dict) or "products" not in data:
        keys = list(data.keys()) if isinstance(data, dict) else type(data).__name__
        return ["No 'products' key in JSON data", f"Available keys: {keys}"]

    if not data["products"]:
        return ["Empty products list"]

    # Only the first product is checked, since only that one is used downstream
    product = data["products"][0]
    required_fields = ["id", "productName", "variationAttributes", "price", "imgs"]
    missing_fields = [field for field in required_fields if field not in product]
    if missing_fields:
        return [
            f"Missing required fields: {missing_fields}",
            f"Available fields: {list(product.keys())}",
        ]

    return []


def get_json_data(pid):
    """Fetch the product JSON for *pid* from the Stone Island API.

    A fresh headless Edge browser is started for every attempt and always
    closed afterwards. Up to 5 attempts are made, with a 3 second pause between
    them regardless of why the previous attempt failed.

    Args:
        pid: Product ID inserted into the API URL.

    Returns:
        The parsed JSON object, whose first product contains all required
        fields, or None if every attempt failed. Errors are logged, not raised.
    """
    import traceback

    max_retries = 5
    retry_delay = 3

    for attempt in range(max_retries):
        driver = None
        try:
            print(f"\n{'='*50}")
            print(f"Processing PID: {pid}")
            print(f"Attempt {attempt + 1}/{max_retries}")

            # Set up the Edge WebDriver
            service = Service('msedgedriver.exe')
            options = webdriver.EdgeOptions()
            options.add_argument('--headless')  # headless mode
            options.add_argument('--disable-gpu')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            options.add_argument('--window-size=1920,1080')
            options.add_argument('--disable-blink-features=AutomationControlled')

            # Custom user agent
            options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

            driver = webdriver.Edge(service=service, options=options)

            # Open the API URL directly
            api_url = f"https://www.stoneisland.com/on/demandware.store/Sites-StoneEU-Site/it_IT/Api-ProductTiles?pid={pid}"
            print(f"Accessing API URL: {api_url}")

            driver.get(api_url)

            # Wait until the page has finished loading
            wait = WebDriverWait(driver, 15)
            wait.until(lambda d: d.execute_script("return document.readyState") == "complete")

            json_text = _extract_json_text(driver, wait)
            if not json_text:
                print("No JSON data found in the page")
            else:
                try:
                    data = json.loads(json_text)
                except json.JSONDecodeError as e:
                    print(f"JSON decode error: {str(e)}")
                    print(f"JSON text (first 500 chars): {json_text[:500]}")
                else:
                    problems = _find_payload_problems(data)
                    for message in problems:
                        print(message)
                    if not problems:
                        print(f"Successfully retrieved complete data for {pid}")
                        return data

        except TimeoutException as e:
            print(f"Timeout error for {pid}: {str(e)}")
        except WebDriverException as e:
            print(f"WebDriver error for {pid}: {str(e)}")
        except Exception as e:
            print(f"Unexpected error for {pid}: {str(e)}")
            print(f"Traceback: {traceback.format_exc()}")

        finally:
            # Always close the browser, including on the successful return above
            if driver:
                try:
                    driver.quit()
                except Exception:
                    # Best-effort shutdown: the browser may already be gone.
                    pass

        # Every kind of failure reaches this point, so all retries are paced
        if attempt < max_retries - 1:
            print(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    print(f"Failed to fetch data after {max_retries} attempts for {pid}")
    return None

def process_product(pid):
    """Fetch one product and queue its Shopify CSV rows for the writer thread.

    One variant row is built per selectable color/size combination; only the
    first carries the product-level columns (title, description, first image,
    ...). Each additional image gets a row of its own. After the rows, the PID
    itself is queued, which tells the writer to remove it from the PID file.

    Args:
        pid: Product ID; also used as the Shopify handle.

    Returns:
        True if the rows were queued. False if the data could not be fetched,
        the product has no selectable color/size combination, or building the
        rows failed; in those cases nothing is queued and the PID stays in the
        PID file.
    """
    global processed_count

    print(f"\nStarting to process product: {pid}")
    data = get_json_data(pid)

    if not data or "products" not in data or not data["products"]:
        print(f"Skipping {pid} due to invalid data")
        return False

    try:
        # Product data
        product = data["products"][0]
        print("Processing product data...")
        print(f"Product ID: {product.get('id', '')}")
        print(f"Product Name: {product.get('productName', '')}")

        # Basic product information
        handle = pid  # the input pid doubles as the handle
        title = normalize_text(product.get("productName", ""))
        description = normalize_text(product.get("shortDescription", ""))
        analytics = product.get("analyticsAttributes", {})
        product_type = analytics.get("item_category2", "")
        gender = analytics.get("item_category4", "")

        # Variation attributes
        variation_attributes = product.get("variationAttributes", [])
        color_data = next(
            (attr for attr in variation_attributes if attr.get("attributeId") == "color"),
            None,
        )
        size_data = next(
            (attr for attr in variation_attributes if attr.get("attributeId") == "size"),
            None,
        )

        # Selectable colors
        colors = []
        if color_data:
            colors = [
                {"value": normalize_text(color["displayValue"]), "id": color["id"]}
                for color in color_data.get("values", [])
                if color.get("selectable", False)
            ]

        # Selectable sizes with their stock ("ATS" in the API response)
        sizes = []
        if size_data:
            sizes = [
                {"value": normalize_text(size["displayValue"]), "stock": size.get("ATS", 0)}
                for size in size_data.get("values", [])
                if size.get("selectable", False)
            ]

        # Without at least one color and one size there is no variant row, and
        # image rows alone would have no product row to belong to. Report
        # failure so the PID is not removed as if it had been exported.
        if not colors or not sizes:
            print(f"Skipping {pid}: no selectable color/size combination")
            return False

        # Price
        sale_price = product.get("price", {}).get("sales", {}).get("value", "")

        # Images
        images = []
        imgs_data = product.get("imgs", {})
        if imgs_data and "urls" in imgs_data:
            images = [
                {"src": img_url, "alt": imgs_data.get("alt", title)}
                for img_url in imgs_data["urls"]
            ]

        # Build the Shopify rows: one per color/size combination
        rows = []
        for color in colors:
            for size in sizes:
                row = {field: "" for field in fields}
                row.update({
                    "Handle": handle,
                    "Option1 Value": color["value"],
                    "Option2 Value": size["value"],
                    "Variant SKU": f"{handle}-{color['id']}-{size['value']}",
                    "Variant Inventory Tracker": "shopify",
                    "Variant Inventory Qty": size["stock"],
                    "Variant Inventory Policy": "deny",
                    "Variant Fulfillment Service": "manual",
                    "Variant Price": sale_price,
                    "Variant Requires Shipping": "TRUE",
                    "Variant Taxable": "TRUE",
                })
                if not rows:
                    # Product-level columns appear on the first row only
                    row.update({
                        "Title": title,
                        "Body (HTML)": description,
                        "Vendor": "Stone Island",
                        "Type": product_type,
                        "Published": "TRUE",
                        "Option1 Name": "Color",
                        "Option2 Name": "Size",
                        "Image Src": images[0]["src"] if images else "",
                        # A position is only meaningful when there is an image
                        "Image Position": 1 if images else "",
                        "Image Alt Text": images[0]["alt"] if images else "",
                        "Gift Card": "FALSE",
                        "SEO Title": title,
                        "SEO Description": description,
                        "Google Shopping / Gender": gender,
                        "Google Shopping / Age Group": "Adult",
                        "Google Shopping / MPN": handle,
                        "Google Shopping / Condition": "New",
                        "Google Shopping / Custom Product": "FALSE",
                        "Variant Weight Unit": "kg",
                        "Status": "active",
                    })
                rows.append(row)

        # One extra row per additional image, positions starting at 2
        for position, img in enumerate(images[1:], start=2):
            image_row = {field: "" for field in fields}
            image_row["Handle"] = handle
            image_row["Image Src"] = img["src"]
            image_row["Image Position"] = position
            image_row["Image Alt Text"] = img["alt"]
            image_row["Status"] = "active"
            rows.append(image_row)

        # Queue all rows and the PID under one lock so that rows of products
        # processed by other threads cannot land between them in the CSV.
        with counter_lock:
            for row in rows:
                write_queue.put(row)

            # Queued after the rows: tells the writer to delete this PID
            write_queue.put(pid)

            # Update progress
            processed_count += 1
            print(f"Progress: {processed_count}/{total_urls} - Processed: {pid}")

        return True

    except Exception as e:
        import traceback
        print(f"Error processing {pid}: {e}")
        print(f"Traceback: {traceback.format_exc()}")
        return False

def remove_successful_pid(pid, pid_file="stoneisland-pid.txt"):
    """Remove a successfully scraped PID from the PID file.

    The remaining lines are written to a temporary file that then replaces the
    original, so an interruption during the write cannot truncate the list.
    Failures are logged, not raised.

    Args:
        pid: The PID to remove; every line equal to it (ignoring surrounding
            whitespace) is dropped.
        pid_file: Path of the PID file, one PID per line.
    """
    import os

    try:
        with open(pid_file, "r", encoding="utf-8") as f:
            # Keep every line except the ones holding the finished PID
            remaining = [line.strip() for line in f if line.strip() != pid]

        # Write next to the target, then swap the new file into place
        tmp_path = f"{pid_file}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write("\n".join(remaining))
        os.replace(tmp_path, pid_file)

        print(f"Successfully removed PID {pid} from file")
    except Exception as e:
        print(f"Error removing PID {pid} from file: {e}")

def writer_thread(output_file):
    """Drain write_queue into a CSV file until a None sentinel arrives.

    Dict items are written as CSV rows and flushed immediately. String items
    are PIDs to delete from the PID file. Any other item is ignored.

    NOTE(review): the file is opened with "w", which truncates a CSV left by a
    previous run, while PIDs exported in that run were already removed from
    the PID file; confirm that a re-run is not expected to keep earlier rows.
    """
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=fields)
        csv_writer.writeheader()

        # iter() with a sentinel ends the loop as soon as None comes off the queue
        for entry in iter(write_queue.get, None):
            if isinstance(entry, dict):
                # Data row: write it and push it to disk right away
                csv_writer.writerow(entry)
                csvfile.flush()
            elif isinstance(entry, str):
                # PID marker: this product's rows were queued before it
                try:
                    remove_successful_pid(entry)
                except Exception as e:
                    print(f"Error removing PID {entry}: {e}")
            write_queue.task_done()

def main():
    """Export every PID listed in stoneisland-pid.txt to a Shopify CSV.

    Starts the CSV writer thread, processes the PIDs on a pool of 10 threads
    and prints a summary. The writer is always sent its stop signal and joined,
    even if the processing phase is interrupted, so the process cannot be left
    waiting on the writer thread.

    Raises:
        OSError: If the PID file cannot be opened.
    """
    global total_urls

    output_file = "stoneisland_products.csv"

    # Read the PID file, ignoring blank lines
    with open("stoneisland-pid.txt", "r", encoding="utf-8") as f:
        pids = [line.strip() for line in f if line.strip()]

    total_urls = len(pids)
    print(f"Total PIDs to process: {total_urls}")

    # Start the CSV writer thread
    writer = threading.Thread(target=writer_thread, args=(output_file,))
    writer.start()

    try:
        # Process the PIDs on a pool of 10 threads
        with ThreadPoolExecutor(max_workers=10) as executor:
            future_to_pid = {executor.submit(process_product, pid): pid for pid in pids}

            # Surface any exception that escaped a task
            for future in as_completed(future_to_pid):
                pid = future_to_pid[future]
                try:
                    future.result()
                except Exception as e:
                    print(f"Task failed for PID {pid}: {e}")
    finally:
        # Without the sentinel the writer thread would block forever and keep
        # the process alive, so send it even when the pool section fails.
        write_queue.put(None)
        writer.join()

    print(f"All products data has been written to {output_file}")
    print(f"Total processed: {processed_count}/{total_urls}")

# Run the product export only when this file is executed as a script.
if __name__ == "__main__":
    main()

搜索结果

stoneisland-getpid

stoneisland