"""
https://www.birkenstock.com/us/styles/boston/?start=24&format=page-element&sz=24&loadMoreProducts=true
href_list = tree.xpath('//*[@class="color-swatch mobile-selectable "]/@href')
"""
import re
import os
import json
import threading
from queue import Queue, Empty
import requests
from lxml import etree
from fake_useragent import UserAgent
from loguru import logger
from concurrent.futures import ThreadPoolExecutor
from tenacity import retry, stop_after_attempt, wait_exponential
# Initialize queues
first_queue = Queue()   # category-page URLs (currently unused)
second_queue = Queue()  # detail-page URLs
# Configure logging
logger.remove()
logger.add("Record_url.log", rotation="500 MB", level="INFO")
logger.add(lambda msg: print(msg, end=""), level="INFO")
# User-Agent generator
ua = UserAgent()
# Thread-safe temporary storage
temp_file_lock = threading.Lock()  # currently unused; TEMP_FILE is written single-threaded
TEMP_FILE = "temp_urls.jsonl"  # JSONL scratch file for collected product URLs
# Output file and its lock
OUTPUT_FILE = "all_products.json"
file_lock = threading.Lock()
def _log_retry(retry_state):
    """before_sleep hook: tenacity's before_sleep_log expects a stdlib logger, so log via loguru directly."""
    logger.info(f"Retrying {retry_state.fn.__name__} (attempt {retry_state.attempt_number}): {retry_state.outcome.exception()!r}")

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10),
       before_sleep=_log_retry)
def fetch_response(target_url):
"""获取网页响应,失败时重试3次"""
headers = {"User-Agent": ua.random}
    # Redirects are not followed; note raise_for_status() only flags 4xx/5xx, so a 3xx returns its stub body.
    response = requests.get(target_url, headers=headers, allow_redirects=False, timeout=10)
response.raise_for_status()
return response
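
# Possible refinement (a sketch, not wired into fetch_response above): reusing one
# requests.Session per thread keeps pooled TCP connections alive across the many
# detail-page fetches. The helper name and thread-local attribute are assumptions.
_thread_local = threading.local()

def get_thread_session():
    """Return this thread's requests.Session, creating it on first use."""
    if not hasattr(_thread_local, "session"):
        _thread_local.session = requests.Session()
    return _thread_local.session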
def save_to_json(product_data):
"""线程安全地将产品数据保存到JSON文件"""
with file_lock:
if not os.path.exists(OUTPUT_FILE):
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
json.dump([], f)
try:
with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
existing_products = json.load(f)
except (json.JSONDecodeError, FileNotFoundError):
existing_products = []
        existing_products.append(product_data)  # append directly; URLs were de-duplicated upstream
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
json.dump(existing_products, f, ensure_ascii=False, indent=2)
logger.info(f"Saved product: {product_data['Title']} (URL: {product_data['URL']})")
def parse_data():
"""解析详情页数据"""
while True:
try:
item = second_queue.get(timeout=5)
url, category, mark = item["url"], item["category"], item["mark"]
try:
response = fetch_response(url).text
tree = etree.HTML(response)
title = (re.findall('Product","name":"(.*?)","image"', response) or [f"{url} - No Title"])[0].strip()
description = (re.findall('"description":"(.*?)","sku"', response) or [""])[0]
price = (tree.xpath('//*[@class="price-standard"]/text()') or ["N/A"])[-1].strip()
color = (tree.xpath('//*[@class="selection-text"]/text()') or ["As Show"])[-1].strip()
img_list = tree.xpath('//*[@class="product-image-thumbnail-slick initially-hidden"]/img/@data-src')
if not img_list or price == "N/A":
logger.warning(f"Parse Data Error Images and Price:{item}")
continue
                size_list = tree.xpath('//*[@class="size-bottom"]/text()') or ["One Size"]
                size_list = [size.strip() for size in size_list]
other_links = []
product_data = {
"Mark": mark,
"Title": title,
"Price": price,
"Color": color,
"Body": description,
"Img_list": img_list,
"Size_list": size_list,
"Other_link": other_links,
"Category": category,
"variant_image": img_list[0],
"URL": url
}
save_to_json(product_data)
except Exception as e:
                logger.error(f"Parse error for {item}: {e}")
finally:
second_queue.task_done()
except Empty:
logger.info("- - - - - - - - - - - - - 详细队列为空,退出解析线程")
break
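
# Shutdown alternative (a sketch; the None-sentinel convention is an assumption):
# instead of relying on the 5-second Empty timeout above, the producer can enqueue
# one None per worker, and each worker exits when it dequeues that sentinel.
def parse_data_with_sentinel():
    """Like parse_data, but exits on a None sentinel instead of a queue timeout."""
    while True:
        item = second_queue.get()
        if item is None:  # poison pill: no more work is coming
            second_queue.task_done()
            break
        # ... same per-item parsing and save_to_json logic as parse_data ...
        second_queue.task_done()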
def deduplicate_and_queue():
"""从临时文件中去重并放入队列"""
url_data = {}
with open(TEMP_FILE, "r", encoding="utf-8") as f:
for line in f:
data = json.loads(line.strip())
mark = data["mark"]
if mark not in url_data:
url_data[mark] = {"url": data["url"], "categories": set()}
url_data[mark]["categories"].add(data["category"])
for mark, data in url_data.items():
second_queue.put({"url": data["url"], "category": ", ".join(data["categories"]), "mark": mark})
print(f"- - - - - - - - - - - - - - 重复数据删除后的唯一产品总数: {len(url_data)}")
    # os.remove(TEMP_FILE)  # remove the temp file once its URLs are queued
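
# Shape of the data flowing through deduplicate_and_queue (illustrative values only):
#   TEMP_FILE line : {"mark": "https://www.birkenstock.com/us/x.html", "url": "https://www.birkenstock.com/us/x.html", "category": "Boston"}
#   queue item     : {"url": "https://www.birkenstock.com/us/x.html", "category": "Boston", "mark": "https://www.birkenstock.com/us/x.html"}
# A product collected under several categories gets them joined, e.g. "Boston, Clogs".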
def get_classify():
    """Walk the load-more pagination and append every product href to TEMP_FILE."""
    page_start = 0
    while True:
url = f"https://www.birkenstock.com/us/styles/boston/?start={page_start}&format=page-element&sz=24&loadMoreProducts=true"
response = fetch_response(url).text
tree = etree.HTML(response)
href_list = tree.xpath(
'//*[@class="color-swatch mobile-selectable "]/@href | //a[contains(@class,"product-tile has-hover-effect")]/@href')
if not href_list:
break
with open(TEMP_FILE, "a", encoding="utf-8") as f:
for href in href_list:
if "https" not in href:
href = "https://www.birkenstock.com" + href
data = {"mark": href, "url": href, "category": "Boston"}
                f.write(json.dumps(data) + "\n")  # one JSON object per line (JSONL)
print(f"{url} --- 完成 Items:{len(href_list)}")
page_start += 24
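
# Optional politeness sketch (an assumption, not called anywhere above): a short
# randomized pause between load-more pages lowers the odds of tripping rate limits.
import random
import time

def polite_pause(low=0.5, high=1.5):
    """Sleep a random interval, e.g. between successive page fetches in get_classify."""
    time.sleep(random.uniform(low, high))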
def main():
get_classify()
deduplicate_and_queue()
    with ThreadPoolExecutor(max_workers=12) as executor:
        # One long-lived parse_data worker per thread; each exits once the queue stays empty for 5s.
        parse_futures = [executor.submit(parse_data) for _ in range(12)]
for future in parse_futures:
future.result()
print("- - - - - - - - - - - - - - Crawling completed - - - - - - - - - - - - - -")
if __name__ == "__main__":
main()