hugoboss

hugoboss-getpid copy
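"""Collect Hugo Boss product IDs.

Walks the paginated men's clothing listing on hugoboss.com/es with Selenium
(Edge), extracts every data-pid attribute from the product anchors, and
appends the de-duplicated IDs to hugoboss-pid.txt. Pages that yield unusually
few IDs have their HTML dumped for debugging.
"""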

import time
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Output file
output_file = "hugoboss-pid.txt"

# Base URL and pagination settings
base_url = "https://www.hugoboss.com/es/hombre-ropa/"
page_size = 36
total_pages = 93

# Initialize the Edge driver
service = Service('./msedgedriver')
options = webdriver.EdgeOptions()
# options.add_argument('--headless')  # Uncomment to run without a visible browser window
driver = webdriver.Edge(service=service, options=options)
wait = WebDriverWait(driver, 10)  # Explicit waits time out after 10 seconds

# Truncate (or create) the output file
with open(output_file, "w") as file:
    file.write("")

try:
    # Iterate over all listing pages
    for page in range(total_pages):
        start = page * page_size
        url = f"{base_url}?start={start}&sz={page_size}"
        print(f"Processing page: {url}")

        try:
            # Load the page
            driver.get(url)
            time.sleep(3)  # Give the page time to render

            # Wait until product elements appear
            wait.until(EC.presence_of_element_located((By.XPATH, '//article')))

            # Extract every data-pid attribute
            data_pids = []
            elements = driver.find_elements(By.XPATH, '//dialog//div/a[@data-pid]')
            for element in elements:
                pid = element.get_attribute('data-pid')
                if pid:
                    data_pids.append(pid)

            # Deduplicate
            data_pids = list(set(data_pids))

            # Warn if this page yielded unusually few IDs (a full page holds page_size items)
            if len(data_pids) < page_size:
                print(f"Warning: page {page + 1}/{total_pages} yielded only {len(data_pids)} IDs (expected up to {page_size})")
                # Save the page source for debugging
                with open(f"debug_page_{page + 1}.html", "w", encoding="utf-8") as debug_file:
                    debug_file.write(driver.page_source)
                print(f"Saved HTML of page {page + 1} for debugging")

            # Append the IDs to the output file
            with open(output_file, "a") as file:
                for data_pid in data_pids:
                    file.write(data_pid + "\n")

            print(f"Page {page + 1}/{total_pages} done, found {len(data_pids)} product IDs")

            # Brief pause to avoid hitting the server too fast
            time.sleep(2)

        except Exception as e:
            print(f"Failed to process page {page + 1}/{total_pages}: {str(e)}")
            continue

finally:
    # Close the browser
    driver.quit()

print(f"All pages processed; results saved to {output_file}")

hugoboss
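"""Convert product JSON feeds into a Shopify import CSV.

Reads one product JSON URL per line from tezenis.txt, fetches them with a
thread pool, flattens each product (sizes, colour, images) into rows of the
standard Shopify product CSV, and hands the rows to a single writer thread.
Note: despite the file's name, the vendor and input file target Tezenis.
"""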

import json
import csv
import requests
import time
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
import threading
import unicodedata

# Standard Shopify product CSV columns
fields = [
    "Handle", "Title", "Body (HTML)", "Vendor", "Type", "Tags", "Published",
    "Option1 Name", "Option1 Value", "Option2 Name", "Option2 Value",
    "Option3 Name", "Option3 Value", "Variant SKU", "Variant Grams",
    "Variant Inventory Tracker", "Variant Inventory Qty", "Variant Inventory Policy",
    "Variant Fulfillment Service", "Variant Price", "Variant Compare At Price",
    "Variant Requires Shipping", "Variant Taxable", "Variant Barcode",
    "Image Src", "Image Position", "Image Alt Text", "Gift Card",
    "SEO Title", "SEO Description", "Google Shopping / Google Product Category",
    "Google Shopping / Gender", "Google Shopping / Age Group",
    "Google Shopping / MPN", "Google Shopping / AdWords Grouping",
    "Google Shopping / AdWords Labels", "Google Shopping / Condition",
    "Google Shopping / Custom Product", "Google Shopping / Custom Label 0",
    "Google Shopping / Custom Label 1", "Google Shopping / Custom Label 2",
    "Google Shopping / Custom Label 3", "Google Shopping / Custom Label 4",
    "Variant Image", "Variant Weight Unit", "Variant Tax Code",
    "Cost per item", "Status"
]

# Thread-safe queue feeding the CSV writer thread
write_queue = Queue()
# Lock guarding the shared progress counter
counter_lock = threading.Lock()
processed_count = 0
total_urls = 0

def normalize_text(text):
    """Flatten special characters and mojibake-prone punctuation."""
    if not text:
        return ""
    # Decompose accented characters into base letter + combining mark
    normalized = unicodedata.normalize('NFKD', text)
    # Drop the combining marks
    normalized = ''.join(c for c in normalized if not unicodedata.combining(c))
    # Explicit map for characters NFKD does not decompose, plus typographic punctuation
    char_map = {
        'ł': 'l', 'Ł': 'L',
        'ą': 'a', 'Ą': 'A',
        'ć': 'c', 'Ć': 'C',
        'ę': 'e', 'Ę': 'E',
        'ń': 'n', 'Ń': 'N',
        'ó': 'o', 'Ó': 'O',
        'ś': 's', 'Ś': 'S',
        'ź': 'z', 'Ź': 'Z',
        'ż': 'z', 'Ż': 'Z',
        '–': '-', '—': '-',
        '“': '"', '”': '"',
        '‘': "'", '’': "'",
    }
    for old, new in char_map.items():
        normalized = normalized.replace(old, new)
    return normalized
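
# Example: NFKD strips the combining accents (ó, ś) while the map catches
# characters that don't decompose (Ł) and the typographic dash:
#   normalize_text("Łódź – ściana") -> "Lodz - sciana"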

def get_json_data(url):
    """Fetch JSON data from a URL."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)  # avoid hanging a worker forever
        response.raise_for_status()
        return response.json()
    except Exception as e:
        print(f"Error fetching data from {url}: {str(e)}")
        return None
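
# Note: get_json_data makes a single attempt per URL; with ten workers hitting
# the same host, transient 429/5xx responses are plausible. A sketch of a
# shared session with retries (standard requests/urllib3 APIs) that could
# replace the bare requests.get above:
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    """Session that retries transient HTTP errors with exponential backoff."""
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=[429, 500, 502, 503, 504])
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retry))
    return session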

def process_product(url):
    """Process a single product URL."""
    global processed_count

    data = get_json_data(url)
    if not data:
        return

    try:
        # Extract the product payload
        product = data.get("product", {})
        if not product:
            return

        # Basic product info
        handle = product.get("id", "").strip()
        title = normalize_text(product.get("productName", ""))
        body_html = ""  # Tezenis doesn't provide detailed description in this API
        vendor = "Tezenis"
        product_type = normalize_text(product.get("categoryName", ""))
        published = "TRUE"

        # Price info
        price_data = product.get("price", {}).get("sales", {})
        variant_price = price_data.get("value", "")

        # Colour and size variation attributes
        variation_attrs = product.get("variationAttributes", [])
        colour = ""
        size_name = ""
        size_values = []
        for attr in variation_attrs:
            if attr.get("attributeId") == "color":
                values = attr.get("values", [])
                if values:
                    colour = normalize_text(values[0].get("displayValue", ""))
            elif attr.get("displayName"):  # treat any other named attribute as the size axis
                size_name = normalize_text(attr.get("displayName", ""))
                size_values = attr.get("values", [])

        # Product images: prefer the 2000px rendition when available
        images = []
        product_images = product.get("images", {}).get("large", [])
        for img in product_images:
            resized_urls = img.get("resizedUrls", [])
            image_url = ""
            for resized_url in resized_urls:  # don't shadow the outer `url` parameter
                if "sw=2000" in resized_url:
                    image_url = resized_url
                    break
            if not image_url and resized_urls:  # no 2000px rendition: fall back to the first URL
                image_url = resized_urls[0]
            if image_url:
                images.append({
                    "src": image_url,
                    "position": len(images) + 1,
                    "alt": title
                })

        # Build the variant rows
        rows = []
        # Products with size variants
        if size_values:
            for size_data in size_values:
                variant_size = normalize_text(size_data.get("id", ""))
                variant_sku = size_data.get("sku", "").strip()
                if not variant_sku:
                    variant_sku = f"{handle}-{variant_size}"

                # Base row for this size variant
                base_row = {
                    "Handle": handle,
                    "Title": title,
                    "Body (HTML)": body_html,
                    "Vendor": vendor,
                    "Type": product_type,
                    "Tags": "",
                    "Published": published,
                    "Option1 Name": size_name,
                    "Option1 Value": variant_size,
                    "Option2 Name": "Color" if colour else "",
                    "Option2 Value": colour,
                    "Variant SKU": variant_sku,
                    "Variant Grams": "",
                    "Variant Inventory Tracker": "shopify",
                    "Variant Inventory Qty": 100,  # 默认库存量
                    "Variant Inventory Policy": "deny",
                    "Variant Fulfillment Service": "manual",
                    "Variant Price": variant_price,
                    "Variant Compare At Price": "",
                    "Variant Requires Shipping": "TRUE",
                    "Variant Taxable": "TRUE",
                    "Variant Barcode": "",
                    "Gift Card": "FALSE",
                    "SEO Title": title,
                    "SEO Description": body_html,
                    "Google Shopping / Google Product Category": "",
                    "Google Shopping / Gender": "Unisex",
                    "Google Shopping / Age Group": "Adult",
                    "Google Shopping / MPN": handle,
                    "Google Shopping / AdWords Grouping": "",
                    "Google Shopping / AdWords Labels": "",
                    "Google Shopping / Condition": "New",
                    "Google Shopping / Custom Product": "FALSE",
                    "Google Shopping / Custom Label 0": "",
                    "Google Shopping / Custom Label 1": "",
                    "Google Shopping / Custom Label 2": "",
                    "Google Shopping / Custom Label 3": "",
                    "Google Shopping / Custom Label 4": "",
                    "Variant Image": "",
                    "Variant Weight Unit": "kg",
                    "Variant Tax Code": "",
                    "Cost per item": "",
                    "Status": "active"
                }

                # One row per image
                for img in images:
                    row = base_row.copy()
                    row["Image Src"] = img["src"]
                    row["Image Position"] = img["position"]
                    row["Image Alt Text"] = img["alt"]
                    # Only the first image row carries the product details
                    if img["position"] > 1:
                        row["Title"] = ""
                        row["Body (HTML)"] = ""
                        row["Vendor"] = ""
                        row["Type"] = ""
                        row["Tags"] = ""
                        row["Option1 Name"] = ""
                        row["Option1 Value"] = ""
                        row["Option2 Name"] = ""
                        row["Option2 Value"] = ""
                        row["Variant SKU"] = ""
                        row["SEO Title"] = ""
                        row["SEO Description"] = ""
                    rows.append(row)

        else:
            # No size variants: build a single base row for the product
            base_row = {
                "Handle": handle,
                "Title": title,
                "Body (HTML)": body_html,
                "Vendor": vendor,
                "Type": product_type,
                "Tags": "",
                "Published": published,
                "Option1 Name": "Size" if size_name else "",
                "Option1 Value": size_name,
                "Option2 Name": "Color" if colour else "",
                "Option2 Value": colour,
                "Variant SKU": handle,
                "Variant Grams": "",
                "Variant Inventory Tracker": "shopify",
                "Variant Inventory Qty": 100,  # 默认库存量
                "Variant Inventory Policy": "deny",
                "Variant Fulfillment Service": "manual",
                "Variant Price": variant_price,
                "Variant Compare At Price": "",
                "Variant Requires Shipping": "TRUE",
                "Variant Taxable": "TRUE",
                "Variant Barcode": "",
                "Gift Card": "FALSE",
                "SEO Title": title,
                "SEO Description": body_html,
                "Google Shopping / Google Product Category": "",
                "Google Shopping / Gender": "Unisex",
                "Google Shopping / Age Group": "Adult",
                "Google Shopping / MPN": handle,
                "Google Shopping / AdWords Grouping": "",
                "Google Shopping / AdWords Labels": "",
                "Google Shopping / Condition": "New",
                "Google Shopping / Custom Product": "FALSE",
                "Google Shopping / Custom Label 0": "",
                "Google Shopping / Custom Label 1": "",
                "Google Shopping / Custom Label 2": "",
                "Google Shopping / Custom Label 3": "",
                "Google Shopping / Custom Label 4": "",
                "Variant Image": "",
                "Variant Weight Unit": "kg",
                "Variant Tax Code": "",
                "Cost per item": "",
                "Status": "active"
            }

            # One row per image
            for img in images:
                row = base_row.copy()
                row["Image Src"] = img["src"]
                row["Image Position"] = img["position"]
                row["Image Alt Text"] = img["alt"]
                # Only the first image row carries the product details
                if img["position"] > 1:
                    row["Title"] = ""
                    row["Body (HTML)"] = ""
                    row["Vendor"] = ""
                    row["Type"] = ""
                    row["Tags"] = ""
                    row["Option1 Name"] = ""
                    row["Option1 Value"] = ""
                    row["Option2 Name"] = ""
                    row["Option2 Value"] = ""
                    row["Variant SKU"] = ""
                    row["SEO Title"] = ""
                    row["SEO Description"] = ""
                rows.append(row)

        # Hand the finished rows to the writer thread
        for row in rows:
            write_queue.put(row)

        # Update progress (the counter is shared across worker threads;
        # the global declaration at the top of the function covers it)
        with counter_lock:
            processed_count += 1
            print(f"Progress: {processed_count}/{total_urls} - Processed: {url}")

    except KeyError as e:
        print(f"Missing key in data for {url}: {e}")
    except Exception as e:
        print(f"Error processing {url}: {e}")

def writer_thread(output_file):
    """CSV writer thread: the csv module is not thread-safe, so a single
    consumer drains the queue instead of letting workers write directly."""
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        writer.writeheader()

        while True:
            row = write_queue.get()
            if row is None:  # shutdown sentinel
                break
            writer.writerow(row)
            csvfile.flush()  # push each row to disk immediately
            write_queue.task_done()

def main():
    output_file = "shopify_products.csv"

    # Read the list of product JSON URLs
    with open("tezenis.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]

    global total_urls
    total_urls = len(urls)
    print(f"Total URLs to process: {total_urls}")

    # Start the CSV writer thread
    writer = threading.Thread(target=writer_thread, args=(output_file,))
    writer.start()

    # Process the URLs with a thread pool
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Submit all tasks
        futures = [executor.submit(process_product, url) for url in urls]

        # Wait for every task to finish
        for future in as_completed(futures):
            try:
                future.result()  # re-raises any exception from the worker
            except Exception as e:
                print(f"Task failed: {e}")

    # Signal the writer thread to shut down
    write_queue.put(None)
    # Wait for the writer thread to drain the queue
    writer.join()

    print(f"All products data has been written to {output_file}")
    print(f"Total processed: {processed_count}/{total_urls}")

if __name__ == "__main__":
    main()
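
The script assumes each line of tezenis.txt points at a product JSON endpoint. Inferred from the .get() chains in process_product, the minimal response shape it can consume looks roughly like this (the field names are the ones the code reads; the values are illustrative, not real data):

sample = {
    "product": {
        "id": "ABC123",                       # becomes Handle / SKU / MPN
        "productName": "Example tee",
        "categoryName": "T-shirts",
        "price": {"sales": {"value": 9.99}},
        "variationAttributes": [
            {"attributeId": "color",
             "values": [{"displayValue": "Black"}]},
            {"displayName": "Size",
             "values": [{"id": "M", "sku": "ABC123-M"}]},
        ],
        "images": {"large": [
            {"resizedUrls": ["https://example.com/img.jpg?sw=2000"]},
        ]},
    }
}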