import json
import csv
import requests
import time
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
import threading
import unicodedata
# Shopify's standard product-import CSV column headers.
# Order matters: csv.DictWriter in writer_thread() emits columns in this order.
fields = [
"Handle", "Title", "Body (HTML)", "Vendor", "Type", "Tags", "Published",
"Option1 Name", "Option1 Value", "Option2 Name", "Option2 Value",
"Option3 Name", "Option3 Value", "Variant SKU", "Variant Grams",
"Variant Inventory Tracker", "Variant Inventory Qty", "Variant Inventory Policy",
"Variant Fulfillment Service", "Variant Price", "Variant Compare At Price",
"Variant Requires Shipping", "Variant Taxable", "Variant Barcode",
"Image Src", "Image Position", "Image Alt Text", "Gift Card",
"SEO Title", "SEO Description", "Google Shopping / Google Product Category",
"Google Shopping / Gender", "Google Shopping / Age Group",
"Google Shopping / MPN", "Google Shopping / AdWords Grouping",
"Google Shopping / AdWords Labels", "Google Shopping / Condition",
"Google Shopping / Custom Product", "Google Shopping / Custom Label 0",
"Google Shopping / Custom Label 1", "Google Shopping / Custom Label 2",
"Google Shopping / Custom Label 3", "Google Shopping / Custom Label 4",
"Variant Image", "Variant Weight Unit", "Variant Tax Code",
"Cost per item", "Status"
]
# Thread-safe queue carrying finished CSV rows from worker threads to the
# single writer thread (queue.Queue is safe for multi-producer use).
write_queue = Queue()
# Lock guarding the shared progress counter below.
counter_lock = threading.Lock()
processed_count = 0  # products fully processed so far; update only under counter_lock
total_urls = 0  # total URL count; set once in main() before workers start
# Characters that NFKD decomposition cannot reduce to ASCII and therefore
# need an explicit mapping:
#   - Polish ł/Ł carry a stroke (not a combining mark), so the
#     combining-mark strip below leaves them untouched;
#   - en/em dashes and curly quotes have no decomposition at all.
# (ą, ć, ę, ń, ó, ś, ź, ż all decompose to letter + combining mark under
# NFKD, so they are handled by the strip and need no entry here. The old
# hand-written quote entries had been mojibake-corrupted into duplicate
# keys with key == value, making the smart-quote mapping a no-op.)
_ASCII_FALLBACKS = str.maketrans({
    'ł': 'l', 'Ł': 'L',
    '\u2013': '-',   # en dash
    '\u2014': '-',   # em dash
    '\u201c': '"',   # left double quotation mark
    '\u201d': '"',   # right double quotation mark
    '\u2018': "'",   # left single quotation mark
    '\u2019': "'",   # right single quotation mark
})
def normalize_text(text):
    """Normalize *text* to a plain-ASCII-friendly form for the CSV export.

    Applies NFKD decomposition, strips the resulting combining marks
    (turning e.g. "é" into "e"), then translates the remaining
    non-decomposable characters (ł/Ł, dashes, curly quotes) to ASCII.

    Returns "" for None or empty input.
    """
    if not text:
        return ""
    # Split accented letters into base letter + combining mark.
    decomposed = unicodedata.normalize('NFKD', text)
    # Drop the combining marks, keeping only the base characters.
    stripped = ''.join(c for c in decomposed if not unicodedata.combining(c))
    # Single C-level pass instead of a chain of .replace() calls.
    return stripped.translate(_ASCII_FALLBACKS)
def get_json_data(url, timeout=15):
    """GET *url* and return its decoded JSON payload, or None on any failure.

    Args:
        url: The endpoint to fetch.
        timeout: Seconds before the request is aborted (default 15).
            The previous version passed no timeout, so one stalled server
            would hang a worker thread forever.

    Returns:
        The parsed JSON object, or None if the request failed, returned a
        non-2xx status, or the body was not valid JSON.
    """
    # Browser-like UA: the site rejects/obstructs default python-requests UAs.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.json()
    # RequestException covers connection/timeout/HTTP errors;
    # ValueError covers json.JSONDecodeError on a non-JSON body.
    except (requests.RequestException, ValueError) as e:
        print(f"Error fetching data from {url}: {str(e)}")
        return None
def process_product(url):
    """Fetch one product's JSON from *url* and enqueue Shopify CSV rows for it.

    Emits one row per size option. Product-level columns (Title, Body,
    Vendor, ...) are filled only on the first row, matching Shopify's
    import format. Rows go to the global write_queue; the shared progress
    counter is updated under counter_lock. Errors are logged, never raised.
    """
    global processed_count
    data = get_json_data(url)
    if not data:
        return
    try:
        # Product payload sits in the second entry of the dehydrated
        # react-query state embedded in the Next.js page props.
        product = data["pageProps"]["dehydratedState"]["queries"][1]["state"]["data"]
        # Product-level fields (written only on the first variant row).
        handle = product.get("itemNumber", "")
        title = normalize_text(product.get("title", ""))
        body_html = normalize_text(product.get("itemDescription", {}).get("composition", ""))
        vendor = normalize_text(product.get("brand", ""))
        product_type = normalize_text(product.get("category", ""))
        tags = normalize_text(product.get("collection", ""))
        published = "TRUE" if product.get("status") == "success" else "FALSE"
        seo_title = title
        seo_description = body_html
        colour = normalize_text(product.get("colour", ""))
        # Absolute image URLs (the feed stores site-relative paths).
        images = [f"https://www.next.pl{img['imageUrl']}" for img in product.get("itemMedia", [])]
        # Size options; one CSV row per option.
        options = product.get("options", {}).get("options", [])
        rows = []
        for idx, variant in enumerate(options):
            option_value = normalize_text(variant["name"])
            variant_sku = f"{handle}-{variant['value']}"
            variant_price = product.get("priceData", {}).get("price", {}).get("minPrice", "")
            # One image per variant; fall back to the first image, or blank
            # for products with no media at all (the previous fallback
            # `images[0]` raised IndexError when `images` was empty).
            if idx < len(images):
                image_src = images[idx]
            elif images:
                image_src = images[0]
            else:
                image_src = ""
            row = {
                "Handle": handle,
                "Title": title if idx == 0 else "",
                "Body (HTML)": body_html if idx == 0 else "",
                "Vendor": vendor if idx == 0 else "",
                "Type": product_type if idx == 0 else "",
                "Tags": tags if idx == 0 else "",
                "Published": published if idx == 0 else "",
                "Option1 Name": "Size",
                "Option1 Value": option_value,
                "Option2 Name": "Color",
                "Option2 Value": colour,
                "Variant SKU": variant_sku,
                "Variant Grams": "",
                "Variant Inventory Tracker": "shopify",
                "Variant Inventory Qty": 100,  # default stock level
                "Variant Inventory Policy": "deny",
                "Variant Fulfillment Service": "manual",
                "Variant Price": variant_price,
                "Variant Compare At Price": "",
                "Variant Requires Shipping": "TRUE",
                "Variant Taxable": "TRUE",
                "Variant Barcode": "",
                "Image Src": image_src,
                "Image Position": idx + 1,
                "Image Alt Text": title,
                "Gift Card": "FALSE",
                "SEO Title": seo_title if idx == 0 else "",
                "SEO Description": seo_description if idx == 0 else "",
                "Google Shopping / Google Product Category": "",
                "Google Shopping / Gender": "Unisex",
                "Google Shopping / Age Group": "Adult",
                "Google Shopping / MPN": handle,
                "Google Shopping / AdWords Grouping": "",
                "Google Shopping / AdWords Labels": "",
                "Google Shopping / Condition": "New",
                "Google Shopping / Custom Product": "FALSE",
                "Google Shopping / Custom Label 0": "",
                "Google Shopping / Custom Label 1": "",
                "Google Shopping / Custom Label 2": "",
                "Google Shopping / Custom Label 3": "",
                "Google Shopping / Custom Label 4": "",
                "Variant Image": "",
                "Variant Weight Unit": "kg",
                "Variant Tax Code": "",
                "Cost per item": "",
                "Status": "active"
            }
            rows.append(row)
        # Hand the finished rows to the single CSV writer thread.
        for row in rows:
            write_queue.put(row)
        # Progress accounting (the redundant second `global` declaration
        # that used to sit here has been removed).
        with counter_lock:
            processed_count += 1
            print(f"Progress: {processed_count}/{total_urls} - Processed: {url}")
    except KeyError as e:
        print(f"Missing key in data for {url}: {e}")
    except Exception as e:
        print(f"Error processing {url}: {e}")
def writer_thread(output_file):
    """Drain write_queue into *output_file* until the None sentinel arrives.

    Runs as the single CSV-writing thread: workers enqueue row dicts, and
    main() enqueues None to signal shutdown.
    """
    with open(output_file, "w", newline="", encoding="utf-8") as out:
        csv_writer = csv.DictWriter(out, fieldnames=fields)
        csv_writer.writeheader()
        # iter(callable, sentinel) keeps calling get() until it returns None.
        for row in iter(write_queue.get, None):
            csv_writer.writerow(row)
            out.flush()  # make each row visible on disk immediately
            write_queue.task_done()
def main():
    """Read URLs from 1.txt, scrape them with a thread pool, stream rows to CSV."""
    global total_urls
    output_file = "shopify_products.csv"
    # Load the URL list, skipping blank lines.
    with open("1.txt", "r", encoding="utf-8") as url_file:
        urls = [stripped for line in url_file if (stripped := line.strip())] if False else [line.strip() for line in url_file if line.strip()]
    total_urls = len(urls)
    print(f"Total URLs to process: {total_urls}")
    # Dedicated writer thread: the only consumer of write_queue.
    csv_thread = threading.Thread(target=writer_thread, args=(output_file,))
    csv_thread.start()
    # Fan the URLs out across 10 worker threads.
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = [pool.submit(process_product, link) for link in urls]
        for done in as_completed(pending):
            try:
                done.result()  # re-raises any exception the task hit
            except Exception as e:
                print(f"Task failed: {e}")
    # All workers finished; tell the writer thread to shut down and wait.
    write_queue.put(None)
    csv_thread.join()
    print(f"All products data has been written to {output_file}")
    print(f"Total processed: {processed_count}/{total_urls}")
if __name__ == "__main__":
main()