🥤 prodirectsport

prodirectsport_getpid
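
This script pages through the prodirectsport.com search API for Nike men's tops, collecting product URLs from each response until an empty page signals the end of the results. The URLs are checkpointed to prodirectsport_urls.txt as it goes.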

import requests
import json
import time
from datetime import datetime

def get_product_urls(page):
    url = f"https://www.prodirectsport.com/api/v1/search?location=/soccer/l/mens/departments-clothing/brand-nike/type-tops/?pg={page}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            data = response.json()
            if "products" in data and data["products"]:
                return [f"https://www.prodirectsport.com{product['url']}" for product in data["products"]]
        return []
    except Exception as e:
        print(f"Error fetching page {page}: {str(e)}")
        return []

def main():
    page = 1
    all_urls = []

    while True:
        print(f"Fetching page {page}...")
        urls = get_product_urls(page)

        if not urls:
            print(f"No more products found on page {page}. Stopping.")
            break

        all_urls.extend(urls)
        print(f"Found {len(urls)} products on page {page}")

        # Rewrite the full URL list each page so progress survives an interrupt
        with open("prodirectsport_urls.txt", "w", encoding="utf-8") as f:
            for url in all_urls:
                f.write(url + "\n")

        page += 1
        time.sleep(2)  # Add delay between requests

    print(f"Total URLs collected: {len(all_urls)}")

if __name__ == "__main__":
    main() 
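
If the listing endpoint hiccups (a timeout or a temporary error status), get_product_urls returns an empty list and the loop above stops early, mistaking the failure for the last page. Below is a minimal retry sketch with exponential backoff; the attempt count, delays, and timeout are illustrative assumptions, not values the API documents.

import time
import requests

def get_with_retry(url, headers, retries=3, backoff=2):
    # Retry transient failures before giving up; the retries/backoff
    # defaults are illustrative, not tuned for this API.
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=15)
            if response.status_code == 200:
                return response
            print(f"HTTP {response.status_code}, attempt {attempt + 1}/{retries}")
        except requests.RequestException as e:
            print(f"Request failed: {e}, attempt {attempt + 1}/{retries}")
        time.sleep(backoff ** attempt)  # 1 s, 2 s, 4 s, ...
    return None

get_product_urls could call get_with_retry in place of the bare requests.get and treat a final None as a hard failure rather than the end of pagination.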

prodirectsport
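
This script reads the URLs collected above, opens each product page in Edge via Selenium, extracts the JSON-LD block plus an embedded script tag (colourway, UK sizes, price), and writes Shopify-import rows to a timestamped CSV. Successfully processed URLs are removed from prodirectsport_urls.txt, so an interrupted run can simply be restarted.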

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import csv
from datetime import datetime
from selenium.webdriver.edge.options import Options
import threading
from queue import Queue, Empty

def extract_data_from_script(script_content):
    try:
        # Parse the embedded JSON payload directly
        data = json.loads(script_content)
        product_data = data.get('product', {})

        # Collect the available sizes
        sizes = []
        for size in product_data.get('sizes', []):
            # Use the UK (GB) size value
            size_value = size.get('values', {}).get('GB', '')
            if size_value:
                sizes.append(size_value)

        return {
            'colourway': product_data.get('colourway', ''),
            'sizes': sizes,
            'price': product_data.get('price', {}).get('currentPriceValue', '')
        }
    except Exception as e:
        print(f"\nError parsing script data: {e}")
        import traceback
        print(traceback.format_exc())
    return {'colourway': '', 'sizes': [], 'price': ''}

def process_product_data(json_data, additional_data):
    # Parse the additional script-tag data (colourway, sizes, price)
    extra_data = extract_data_from_script(additional_data)

    # Build the base product row using Shopify's import columns
    product = {
        "Handle": json_data.get("sku", "").lower(),
        "Title": json_data.get("name", ""),
        "Body (HTML)": json_data.get("description", ""),
        "Vendor": json_data.get("brand", ""),
        "Type": "Clothing",
        "Tags": f"colourway:{extra_data['colourway']}" if extra_data['colourway'] else "",
        "Published": "TRUE",
        "Option1 Name": "Size",
        "Option1 Value": "",
        "Option2 Name": "Color",
        "Option2 Value": extra_data['colourway'],
        "Option3 Name": "",
        "Option3 Value": "",
        "Variant SKU": "",
        "Variant Grams": "0",
        "Variant Inventory Tracker": "shopify",
        "Variant Inventory Qty": "1",
        "Variant Inventory Policy": "deny",
        "Variant Fulfillment Service": "manual",
        "Variant Price": str(extra_data['price']),
        "Variant Compare At Price": "",
        "Variant Requires Shipping": "TRUE",
        "Variant Taxable": "TRUE",
        "Variant Barcode": "",
        "Image Src": "",
        "Image Position": "",
        "Image Alt Text": "",
        "Gift Card": "FALSE",
        "SEO Title": "",
        "SEO Description": "",
        "Google Shopping / Google Product Category": "",
        "Google Shopping / Gender": "",
        "Google Shopping / Age Group": "",
        "Google Shopping / MPN": json_data.get("mpn", ""),
        "Google Shopping / AdWords Grouping": "",
        "Google Shopping / AdWords Labels": "",
        "Google Shopping / Condition": "",
        "Google Shopping / Custom Product": "",
        "Google Shopping / Custom Label 0": "",
        "Google Shopping / Custom Label 1": "",
        "Google Shopping / Custom Label 2": "",
        "Google Shopping / Custom Label 3": "",
        "Google Shopping / Custom Label 4": "",
        "Variant Image": "",
        "Variant Weight Unit": "g",
        "Variant Tax Code": "",
        "Cost per item": "",
        "Status": "active"
    }

    # Build one variant row per size
    variants = []
    if extra_data['sizes']:  # sizes pulled from the embedded script tag
        for size in extra_data['sizes']:
            variant = product.copy()
            variant["Option1 Value"] = size
            variant["Variant SKU"] = f"{product['Handle']}-{size}"
            variants.append(variant)

    # Attach images: the first image goes on the first variant row, extra images get rows of their own
    if "image" in json_data and isinstance(json_data["image"], list):
        for idx, img in enumerate(json_data["image"], 1):
            if idx == 1 and variants:
                variants[0]["Image Src"] = img
                variants[0]["Image Position"] = str(idx)
            else:
                variant = variants[0].copy() if variants else product.copy()
                variant["Image Src"] = img
                variant["Image Position"] = str(idx)
                variant["Variant SKU"] = ""
                variant["Option1 Value"] = ""
                variants.append(variant)

    return variants

def get_product_data(url):
    driver = None
    try:
        # Configure Edge options
        edge_options = Options()
        # edge_options.add_argument('--headless')  # enable headless mode
        edge_options.add_argument('--disable-gpu')
        edge_options.add_argument('--no-sandbox')
        edge_options.add_argument('--disable-dev-shm-usage')
        edge_options.add_argument('--disable-logging')
        edge_options.add_argument('--log-level=3')
        edge_options.add_argument('--window-size=1920,1080')

        # Launch the Edge browser
        driver = webdriver.Edge(options=edge_options)
        driver.set_page_load_timeout(60)  # allow up to 60 s for the page to load

        # Navigate to the product page
        print(f"\nProcessing: {url}")
        driver.get(url)

        # Give the page time to settle
        time.sleep(5)

        # Wait for the JSON-LD script tag to appear
        wait = WebDriverWait(driver, 20)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'script[type="application/ld+json"]')))

        # Wait a little longer for the rest of the content to load
        time.sleep(5)

        # Extract the JSON-LD payload
        js_script = """
        let jsonLd = document.querySelector('script[type="application/ld+json"]');
        return jsonLd ? jsonLd.textContent : 'JSON-LD not found';
        """
        content = driver.execute_script(js_script)

        # Extract the script tag at a fixed XPath (carries colourway, sizes, price)
        js_script2 = """
        let script = document.evaluate('/html/body/main/div[1]/div[3]/script', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
        return script ? script.textContent : 'Script not found';
        """
        content2 = driver.execute_script(js_script2)

        # Parse both payloads and build the variant rows
        if content != 'JSON-LD not found' and content2 != 'Script not found':
            json_data = json.loads(content)
            variants = process_product_data(json_data, content2)
            if variants:
                print(f"Successfully processed: {url}")
                return variants
            else:
                print(f"No variants data for: {url}")
                return None
        else:
            print(f"No script data found for: {url}")
            return None

    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None

    finally:
        if driver:
            try:
                driver.quit()
            except Exception:
                pass  # ignore errors while shutting the browser down

def worker(url_queue, csv_writer, csv_lock, success_urls):
    # Pull URLs off the queue until it is empty.
    while True:
        try:
            url = url_queue.get_nowait()
        except Empty:  # module-level queue.Empty, not an attribute of the Queue class
            break

        try:
            variants = get_product_data(url)
            if variants:
                with csv_lock:
                    csv_writer.writerows(variants)
                    success_urls.append(url)
        finally:
            url_queue.task_done()

def main():
    urls_file = 'prodirectsport_urls.txt'
    # Read every URL that still needs processing
    with open(urls_file, 'r', encoding='utf-8') as f:
        urls = [line.strip() for line in f if line.strip()]

    print(f"Total URLs to process: {len(urls)}")

    # Timestamped CSV output file
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = f"prodirectsport_products_{timestamp}.csv"

    # Fill the work queue
    url_queue = Queue()
    for url in urls:
        url_queue.put(url)

    # Lock guarding the shared CSV writer, plus a list of successfully processed URLs
    csv_lock = threading.Lock()
    success_urls = []

    # Open the CSV file and stream rows into it
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        # Build the CSV writer with Shopify's import columns
        fieldnames = ["Handle", "Title", "Body (HTML)", "Vendor", "Type", "Tags", "Published", 
                     "Option1 Name", "Option1 Value", "Option2 Name", "Option2 Value", 
                     "Option3 Name", "Option3 Value", "Variant SKU", "Variant Grams", 
                     "Variant Inventory Tracker", "Variant Inventory Qty", "Variant Inventory Policy",
                     "Variant Fulfillment Service", "Variant Price", "Variant Compare At Price",
                     "Variant Requires Shipping", "Variant Taxable", "Variant Barcode",
                     "Image Src", "Image Position", "Image Alt Text", "Gift Card",
                     "SEO Title", "SEO Description", "Google Shopping / Google Product Category",
                     "Google Shopping / Gender", "Google Shopping / Age Group",
"Google Shopping / MPN", "Google Shopping / AdWords Grouping",
                     "Google Shopping / AdWords Labels", "Google Shopping / Condition",
                     "Google Shopping / Custom Product", "Google Shopping / Custom Label 0",
                     "Google Shopping / Custom Label 1", "Google Shopping / Custom Label 2",
                     "Google Shopping / Custom Label 3", "Google Shopping / Custom Label 4",
                     "Variant Image", "Variant Weight Unit", "Variant Tax Code",
                     "Cost per item", "Status"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Create and start the worker threads
        threads = []
        for _ in range(1):  # one worker thread; raise the count for more concurrency
            t = threading.Thread(target=worker, args=(url_queue, writer, csv_lock, success_urls))
            t.start()
            threads.append(t)

        # Wait for all workers to finish
        for t in threads:
            t.join()

        # Make sure everything is flushed to disk
        csvfile.flush()

    # Rewrite the URL file without the successfully processed URLs so a rerun resumes where it left off
    remaining_urls = [url for url in urls if url not in success_urls]
    with open(urls_file, 'w', encoding='utf-8') as f:
        for url in remaining_urls:
            f.write(url + '\n')

    print(f"\nProcessing completed:")
    print(f"Total URLs: {len(urls)}")
    print(f"Successfully processed: {len(success_urls)}")
    print(f"Failed URLs: {len(remaining_urls)}")
    print(f"Data saved to: {csv_filename}")
    print(f"Remaining URLs saved to: {urls_file}")

if __name__ == "__main__":
    main()
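
The hand-rolled Queue/worker/lock setup above works, but concurrent.futures can express the same fan-out with less machinery. Here is a sketch under the same interfaces as main(): run_pool is a hypothetical helper, get_product_data is the function defined above, and max_workers=5 is an illustrative default (each worker launches its own Edge instance, so keep the count modest).

from concurrent.futures import ThreadPoolExecutor, as_completed

def run_pool(urls, writer, success_urls, max_workers=5):
    # Scraping runs in worker threads; every CSV write happens here in
    # the submitting thread, so no lock is needed.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(get_product_data, url): url for url in urls}
        for future in as_completed(futures):
            url = futures[future]
            variants = future.result()
            if variants:
                writer.writerows(variants)
                success_urls.append(url)

Because results are consumed in the submitting thread, the csv_lock from main() becomes unnecessary.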