# Shopify scraping script - https://www.dickssportinggoods.com/f/mens-swimsuits?filterFacets=X_BRAND%253ANike
# Target listing URL: https://www.dickssportinggoods.com/f/mens-swimsuits?filterFacets=X_BRAND%253ANike

import random, csv, requests, logging, time, json, re
from selenium import webdriver
from copy import deepcopy
from time import sleep
from lxml import etree
from DrissionPage import Chromium, ChromiumOptions
from DrissionPage.common import Keys

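# data['Body (HTML)'] = etree.tostring(product_html.xpath('//div[@class="rte"]')[0], pretty_print=True, method='html').decode('utf-8')
# After a crash: pass True to run() in the main block, then open the CSV file and delete the rows of the most recent (partially written) product.
# Some products may require selecting an option before the price is shown.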

class Dickssportinggoods:
    """Scrape a Dick's Sporting Goods listing into a Shopify-style CSV file.

    ``run`` opens the CSV, starts a DrissionPage-controlled Chromium and calls
    ``parse``.  ``parse`` walks the listing pages, opens every product/colour
    page and writes one CSV row per size/width combination followed by one
    row per additional image.  The URL of each finished product is appended
    to ``./filter.txt`` so that an interrupted run can be resumed with
    ``run(True)``.
    """

    # XPath that the product-page loaders wait for before scraping.
    _SPECS_XPATH = '//div[@data-testid="product-info-specs"]'
    # XPath that the listing-page loader waits for.
    _LISTING_XPATH = '//div[contains(@class, "pro-tips-carousel-item")]'

    def __init__(self):
        """Set the listing URL, the CSV row templates and empty I/O handles."""
        self.url = 'https://www.dickssportinggoods.com/f/mens-swimsuits?filterFacets=X_BRAND%253ANike'
        self.headers = {}  # HTTP headers sent by get_response

        self.id = 0  # running product counter for progress output; not a CSV column

        # CSV column order.  Both row templates below are derived from it so
        # the column names are spelled out exactly once.
        self.field_names = ['Handle', 'Title', 'Body (HTML)', 'Vendor', 'Type', 'Tags', 'Published', 'Option1 Name',
                            'Option1 Value', 'Option2 Name', 'Option2 Value', 'Option3 Name', 'Option3 Value',
                            'Variant SKU', 'Variant Grams', 'Variant Inventory Tracker', 'Variant Inventory Qty',
                            'Variant Inventory Policy', 'Variant Fulfillment Service', 'Variant Price',
                            'Variant Compare At Price', 'Variant Requires Shipping', 'Variant Taxable',
                            'Variant Barcode', 'Image Src', 'Image Position', 'Image Alt Text', 'Gift Card',
                            'SEO Title', 'SEO Description', 'Variant Image', 'Status', 'Collection']

        # Template for image-only rows: every column blank.
        self.empty_data = dict.fromkeys(self.field_names, '')

        # Template for variant rows: blank columns plus the fixed defaults.
        self.init_data = dict(self.empty_data)
        self.init_data.update({
            'Published': 'TRUE',
            'Option1 Name': 'Color',
            'Option2 Name': 'Size',
            'Option3 Name': 'Width',
            'Variant Inventory Tracker': 'Shopify',
            'Variant Inventory Qty': '99999',
        })

        self.file = None    # CSV file handle, opened by run
        self.writer = None  # csv.DictWriter bound to self.file

        self.browser = None  # DrissionPage Chromium instance, created by init_tab
        self.tab = None      # tab currently used for scraping

    def simulated_smooth_scroll(self, driver, step=1000, interval=0.5, timeout=30):
        """Scroll a Selenium page to the bottom in steps.

        Args:
            driver: object exposing ``execute_script`` (a Selenium driver).
            step: maximum number of pixels scrolled per iteration.
            interval: sleep in seconds after a full ``step``; shorter steps
                sleep proportionally less.
            timeout: overall time budget in seconds.
        """
        start_time = time.time()
        last_height = driver.execute_script("return document.documentElement.scrollHeight")
        current_position = 0

        while time.time() - start_time < timeout:
            # Distance still to scroll, based on the tallest height seen so far.
            remaining = last_height - current_position
            current_step = min(step, remaining) if remaining > 0 else 0

            if current_step <= 0:
                break

            driver.execute_script(f"window.scrollBy(0, {current_step})")
            current_position += current_step

            # Give the scroll (and any lazy loading it triggers) time to settle.
            time.sleep(interval * (current_step / step))

            new_height = driver.execute_script(
                "return document.documentElement.scrollHeight"
            )

            # The page grew while scrolling: keep going to the new bottom.
            if new_height > last_height:
                last_height = new_height

    def get_driver(self, url, xpath_txt=None, is_turn=False):
        """Start a headless Selenium Chrome, load ``url`` and return the driver.

        The load is retried without limit until it succeeds.

        Args:
            url: page to open.
            xpath_txt: if given, the load only counts as successful once this
                XPath is found; otherwise a fixed pause is used instead.
            is_turn: scroll to the bottom of the page after loading.

        Returns:
            The Selenium driver; the caller is responsible for quitting it.
        """
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.page_load_strategy = "none"

        driver = webdriver.Chrome(options=options)

        driver.implicitly_wait(10)
        driver.maximize_window()

        while True:
            try:
                print('正在获取', url, '的页面数据')
                driver.get(url)

                if is_turn:
                    sleep(1)
                    self.simulated_smooth_scroll(driver)

                if xpath_txt:
                    driver.find_element('xpath', xpath_txt)
                else:
                    self.random_sleep(5)

                break

            # Not a bare ``except``: Ctrl-C must still be able to stop the loop.
            except Exception as e:
                print(url, '没定位到，重新请求...')
                print(e)

        return driver

    def driver_continue(self, driver, url, xpath_txt=None, is_turn=False):
        """Load ``url`` in an existing Selenium driver, retrying until it works.

        Args:
            driver: Selenium driver to reuse.
            url: page to open.
            xpath_txt: XPath that must be present for the load to count as
                successful.  When omitted a fixed pause is used instead, the
                same way ``get_driver`` does.
            is_turn: scroll to the bottom of the page after loading.
        """
        while True:
            try:
                print('正在获取', url, '的页面数据')
                driver.get(url)

                if is_turn:
                    self.random_sleep()
                    self.simulated_smooth_scroll(driver)

                if xpath_txt:
                    driver.find_element('xpath', xpath_txt)
                else:
                    # Without this guard a missing XPath made every attempt
                    # fail and the loop never ended.
                    self.random_sleep(5)

                break

            except Exception as e:
                print(url, '没定位到，重新请求...')
                print(e)

    def get_page_html(self, url, xpath_txt=None, is_turn=False):
        """Fetch ``url`` with a throw-away Selenium driver.

        Returns:
            The page source parsed with ``etree.HTML``.
        """
        driver = self.get_driver(url, xpath_txt, is_turn=is_turn)

        try:
            page_source = driver.page_source
        finally:
            # quit() ends the whole browser session; close() only closes the
            # current window.
            driver.quit()

        return etree.HTML(page_source)

    def writer_to_file(self, data, mode, encoding=None, path='./text.html'):
        """Dump ``data`` to a file (debugging aid).

        Args:
            data: ``str`` for text modes, ``bytes`` for binary modes.
            mode: ``open`` mode, e.g. ``'w'`` or ``'wb'``.
            encoding: text encoding; ignored for binary modes.
            path: destination file.
        """
        # Binary-ness is a property of the mode, not of the encoding.
        if 'b' in mode:
            with open(path, mode) as f:
                f.write(data)
        else:
            with open(path, mode, encoding=encoding) as f:
                f.write(data)

        print('写入文件成功！')

    def driver_click(self, driver, timeout=2):
        """Call ``driver.click()`` and pause for about ``timeout`` seconds."""
        driver.click()
        self.random_sleep(timeout)

    def driver_back(self, driver, timeout=2):
        """Call ``driver.back()`` and pause for about ``timeout`` seconds."""
        driver.back()
        self.random_sleep(timeout)

    def driver_refresh(self, driver, timeout=2):
        """Call ``driver.refresh()`` and pause for about ``timeout`` seconds."""
        driver.refresh()
        self.random_sleep(timeout)

    def tab_wait(self, tab, timeout=3):
        """Call ``tab.wait(timeout)`` and return ``tab``."""
        tab.wait(timeout)
        return tab

    def get_dp_html(self, tab, url, xpath_txt='html', is_turn=False):
        """Load ``url`` in a DrissionPage tab until ``xpath_txt`` is present.

        Args:
            tab: DrissionPage tab to drive.
            url: page to open.
            xpath_txt: XPath that must be found before the page is accepted.
            is_turn: scroll to the bottom after loading.

        Returns:
            The tab's HTML parsed with ``etree.HTML``.
        """
        print('正在获取', url, '的数据')

        while True:
            tab.get(url)
            sleep(5)

            if is_turn:
                tab.scroll.to_bottom()
                self.tab_wait(tab)

            if tab.ele(f'x:{xpath_txt}'):
                break

            print('没有请求到元素，重新请求中')

        return etree.HTML(tab.html)

    def get_detail_tab(self, url, xpath_txt='html'):
        """Open ``url`` in the browser's latest tab and wait for ``xpath_txt``.

        The page is scrolled down before the element lookup and back to the
        top once the element has been found.  The whole load is retried until
        the element is present.

        Returns:
            The tab showing the loaded page.
        """
        print('正在获取', url, '的数据')

        tab = self.browser.latest_tab
        tab.set.window.max()

        while True:
            tab.get(url)

            # Shared retry helper instead of a hand-rolled tight loop.
            self.tab_run_js(tab, 'window.scrollTo(0, 2500);')

            try:
                if tab.ele(f'x:{xpath_txt}'):
                    break
            except Exception as e:
                print('get_detail_tab里的查找xpath报错')
                print(e)

            print('没有请求到元素，重新请求中')

        self.tab_run_js(tab, 'window.scrollTo(0, -2500);')

        return tab

    def ele_click(self, tab, ele):
        """Click ``ele`` through ``tab.actions``, retrying every second.

        NOTE(review): there is no retry limit, so an element that can never
        be clicked keeps this loop running — confirm callers pass a real one.
        """
        while True:
            try:
                tab.actions.click(ele)
                break
            except Exception as e:
                print(e)
                tab.wait(1)

    def dp_click_ad(self, tab, xpath_txt):
        """Click the element at ``xpath_txt`` if it shows up within 3 seconds."""
        t_ele = tab.ele(f'x:{xpath_txt}', timeout=3)
        if t_ele:
            self.ele_click(tab, t_ele)
            self.tab_wait(tab, 1)

    def random_sleep(self, timeout=2):
        """Sleep for ``timeout`` seconds plus a random fraction of a second."""
        sleep(random.random() + timeout)

    def save_csv(self, data):
        """Write one CSV row with the columns listed in ``self.field_names``.

        Keys of ``data`` that are not CSV columns are ignored.

        Raises:
            KeyError: if ``data`` lacks one of the CSV columns.
        """
        self.writer.writerow({name: data[name] for name in self.field_names})

    def get_response(self, url):
        """GET ``url`` with ``requests``, retrying until a response arrives.

        Returns:
            The ``requests`` response object (its status is not checked).
        """
        print('正在获取', url, '的数据')

        while True:
            try:
                # The timeout turns a stalled connection into a retry instead
                # of blocking forever.
                response = requests.get(url, headers=self.headers, timeout=30)
                break
            except Exception as e:
                print('没有请求到，重新请求')
                print(e)
                self.random_sleep()

        self.random_sleep()

        return response

    def tab_run_js(self, tab, js_code):
        """Run ``js_code`` in ``tab``, retrying every 2 seconds on failure."""
        while True:
            try:
                tab.run_js(js_code)
                break
            except Exception as e:
                print('捕获tab_run_js方法的run_js:', e)
                tab.wait(2)

    def get_html(self, url):
        """Fetch ``url`` with ``get_response`` and parse it with ``etree.HTML``."""
        response = self.get_response(url)
        return etree.HTML(response.text)

    def parse(self, url):
        """Scrape every product reachable from the listing at ``url``.

        Products already listed in ``./filter.txt`` are skipped.  After each
        listing page the listing is reloaded and the "Next Page" button is
        clicked once per page already processed to reach the following page.
        """
        data = dict(self.init_data)

        # NOTE(review): the listing URL and the CSV file name are for men's
        # swimsuits but the type says "Women's" — confirm which is intended.
        data['Type'] = "Women's Swimsuits"
        data['Collection'] = data['Type']

        page = 0

        self.get_dp_html(self.tab, url, self._LISTING_XPATH)  # request 1

        while True:
            index_html = etree.HTML(self.tab.html)

            # Must be read now: self.tab shows product pages after the loop.
            is_next = len(index_html.xpath('//button[@title="Next Page"]')) > 0
            print('is_next:', is_next)

            product_url_list = index_html.xpath('//div[@class="product-card-image hmf-mt-xxs hmf-mb-xs"]/a[@class="image"]/@href')

            for product_url in product_url_list:
                self._scrape_product(data, product_url)

            if not is_next:
                break

            self.get_dp_html(self.tab, url, self._LISTING_XPATH)

            page += 1
            for _ in range(page):
                self.tab.scroll.to_bottom()
                self.tab.wait(2)
                self.tab_run_js(self.tab, 'window.scrollBy(0, -2500)')
                self.tab.wait(2)
                self.ele_click(self.tab, self.tab.ele('x://button[@title="Next Page"]'))
                self.tab.wait(3)

    def _scrape_product(self, data, product_url):
        """Scrape all colours of one product unless it is already finished."""
        self.id += 1

        if 'https:' not in product_url:
            product_url = 'https://www.dickssportinggoods.com' + product_url

        product_url = product_url.replace(' ', '/')

        with open('./filter.txt', 'r', encoding='utf-8') as f:
            finished_urls = set(f.read().splitlines())

        # Whole-line comparison: a substring test would also skip a URL that
        # is merely a prefix of an already finished one.
        if product_url in finished_urls:
            print(self.id, '已完成')
            return

        print(self.id, '开始')

        self.tab = self.get_detail_tab(product_url, self._SPECS_XPATH)  # request 2

        color_ele_list = self.tab.eles('x://div[@class="sliding-row-inner hmf-grid hmf-pb-xs"]//button[contains(@class, "pdp-color-swatch-selected")]/img')

        product_color_list = []
        for color_ele in color_ele_list:
            alt_text = color_ele.attr('alt')
            if alt_text is None:
                continue  # swatch image without a colour name
            product_color_list.append(alt_text.strip())

        old_color = product_url.split('color=')[-1]
        for product_color in product_color_list:
            self._scrape_color(data, product_url, old_color, product_color)

        print(self.id, '结束')

        with open('./filter.txt', 'a', encoding='utf-8') as f:
            f.write(product_url + '\n')

    def _scrape_color(self, data, product_url, old_color, product_color):
        """Open one colour of a product and write its variant and image rows."""
        data['Option1 Value'] = product_color

        # NOTE(review): the site may separate the parts of this value with
        # spaces rather than '/' — carried over from the original remark.
        color_txt = product_color.replace(' ', '/')
        product_color_url = product_url.replace(f'color={old_color}', f'color={color_txt}')

        self.tab = self.get_detail_tab(product_color_url, self._SPECS_XPATH)  # request 3

        data['Title'] = self.tab.ele('x://h1[@class="hmf-header-bold-m hmf-header-bold-m-l"]').text.strip()
        data['Handle'] = data['Title'].lower().replace(' ', '-')
        data['Body (HTML)'] = self.tab.ele('x:' + self._SPECS_XPATH).html

        self._read_price(data)

        product_img_url_list = self._load_gallery(product_color_url)
        size_eles, width_eles = self._option_elements()
        self._write_variant_rows(data, size_eles, width_eles, product_img_url_list, product_color_url)

        self.tab_wait(self.tab, 1)

    def _read_price(self, data):
        """Fill the price columns of ``data`` from the current product page."""
        # ``data`` is reused for every product, so clear the previous
        # product's compare-at price before reading this one.
        data['Variant Compare At Price'] = ''

        price_ele = self.tab.ele('x://div[@class="hmf-mb-xxxs hmf-header-bold-l"]/span')
        if price_ele:
            data['Variant Price'] = price_ele.text.strip('$')
            return

        try:
            data['Variant Price'] = self.tab.ele('x://span[@class="product-price"]').text.strip('$')
            data['Variant Compare At Price'] = self.tab.ele('x://span[@class="price-list"]/span').text.strip('$')
        except Exception:
            try:
                data['Variant Price'] = self.tab.ele('x://span[@class="hmf-mb-xxxs hmf-header-bold-l price-list"]/span').text.strip('$')
            except Exception:
                # Placeholder written when no price element could be read.
                data['Variant Price'] = '999999'

    def _expand_gallery(self):
        """Click the gallery toggle until the page offers to collapse it.

        Returns:
            ``(attempts, expected, ok)``: attempts made, image count parsed
            from the toggle text (0 if never seen), and whether the gallery
            can be read (``False`` means the page should be reloaded).
        """
        toggle_xpath = 'x://button[@class="hmf-button hmf-label-m medium-btn secondary-btn"]'
        attempts = 0
        expected = 0

        while True:
            attempts += 1

            # Dismiss overlays that could intercept the click.
            self.dp_click_ad(self.tab, '//i[@class="fa fa-times font-size-20"]')
            self.dp_click_ad(self.tab, '//button[@data-event="click/close"]')
            self.tab_wait(self.tab, 1)

            toggle_ele = self.tab.ele(toggle_xpath)
            print(toggle_ele)

            # Reset on every attempt so that text from a toggle which has
            # since disappeared cannot trigger a click on a missing element.
            click_text = ''
            if toggle_ele:
                click_text = self.tab.ele(toggle_xpath + '//span[@class="hmf-button-container"]//span').text

            print('点击的文本：', click_text)

            if 'All' in click_text:
                # assumes the second-to-last word of the label is the image
                # count — TODO confirm against the live page
                expected = int(click_text.strip().split(' ')[-2])
                self.tab.actions.click(toggle_ele)
                print('点击了加载图片按钮， 尝试点击的第', attempts, '下')
            elif 'Less' in click_text:
                return attempts, expected, True

            if attempts >= 5:
                if not toggle_ele:
                    print('可能真没有扩展图片')
                    return attempts, expected, True

                print('没点到扩展，重新请求')
                return attempts, expected, False

    def _load_gallery(self, product_color_url):
        """Return the image ``source`` elements of the current product page.

        The page is reloaded and the gallery expanded again until the number
        of images matches the count announced by the toggle, at least five
        images were found, or five expansion attempts were used.
        """
        while True:
            self.tab_run_js(self.tab, 'window.scrollTo(0, 1500);')
            self.tab_wait(self.tab, 1)

            attempts, expected, expanded = self._expand_gallery()

            if not expanded:
                self.tab = self.get_detail_tab(product_color_url, self._SPECS_XPATH)  # request 4
                continue

            self.tab_wait(self.tab, 6)

            product_img_url_list = self.tab.eles('x://pdp-product-image//picture//source[1]')
            product_img_url_len = len(product_img_url_list)

            print('所有的图片个数应为：', expected)
            print('获取到的图片个数为：', product_img_url_len)

            complete = attempts >= 5 or product_img_url_len == expected
            if not complete:
                print('没获取到所有的图片，重新获取中')

            self.tab_run_js(self.tab, 'window.scrollTo(0, -1500);')

            if product_img_url_len >= 5 or complete:
                return product_img_url_list

            self.tab = self.get_detail_tab(product_color_url, '//div[@class="hmf-mx-0 hmf-px-xs hmf-py-s hmf-mb-s hmf-mb-md-l"]')  # request 5

    def _option_elements(self):
        """Return ``(sizes, widths)`` found on the current product page.

        Each list falls back to the single selected-value label when no
        selector buttons exist; widths fall back further to the literal
        string ``'One Width'``.
        """
        size_eles = self.tab.eles('x://div[@class="hmf-grid selector-attribute-outer overflow-scroll"]//span')
        if len(size_eles) == 0:
            size_eles.append(self.tab.ele('x://p[@class="hmf-pt-xs"]/pdp-attributes-components-base-attribute-label[contains(./span[1], "Size")]/span[@class="hmf-header-xs hmf-text-transform-none"]'))

        print('尺寸个数为：', len(size_eles))

        width_eles = self.tab.eles('x://div[@class="hmf-grid selector-attribute-outer overflow-scroll"]//span[@class="width"]')
        if len(width_eles) == 0:
            width_label = self.tab.ele('x://p[@class="hmf-pt-xs"]/pdp-attributes-components-base-attribute-label[./span[contains(text(), "Width")]]/span[@class="hmf-header-xs hmf-text-transform-none"]')
            width_eles.append(width_label if width_label else 'One Width')

        print('宽度个数为：', len(width_eles))

        return size_eles, width_eles

    def _write_variant_rows(self, data, size_eles, width_eles, product_img_url_list, product_color_url):
        """Write one row per size/width pair, each followed by its image rows."""
        product_img_url_len = len(product_img_url_list)

        for size_ele in size_eles:
            size_text = size_ele.text.strip() if size_ele else ''
            data['Option2 Value'] = size_text or 'One Size'

            for width_ele in width_eles:
                if isinstance(width_ele, str):
                    data['Option3 Value'] = width_ele
                else:
                    data['Option3 Value'] = width_ele.text.strip()

                data['Variant SKU'] = data['Handle'] + '-' + data['Option2 Value'] + '-' + data['Option3 Value']

                print('第', self.id, '个产品', product_color_url, '的', data['Option1 Value'], '色的图片个数为：', product_img_url_len)

                if product_img_url_len == 0:
                    break  # nothing is written for a colour without images

                for position, img_ele in enumerate(product_img_url_list, start=1):
                    # Prefer srcset and fall back to src when it is absent.
                    img_src = img_ele.attr('srcset') or img_ele.attr('src')

                    if position == 1:
                        # The first image travels on the variant row itself.
                        data['Image Src'] = img_src
                        data['Image Position'] = 1
                        data['Variant Image'] = data['Image Src']

                        self.save_csv(data)
                        print(data)
                        continue

                    temp_data = dict(self.empty_data)
                    temp_data['Handle'] = data['Handle']
                    temp_data['Published'] = 'TRUE'
                    temp_data['Image Src'] = img_src
                    temp_data['Image Position'] = position

                    self.save_csv(temp_data)
                    print(temp_data)

    def init_tab(self):
        """Start Chromium through DrissionPage and maximise its latest tab."""
        co = ChromiumOptions()
        co.set_browser_path(r'C:\Program Files\Google\Chrome\Application\chrome.exe')
        co.auto_port()

        self.browser = Chromium(co)

        self.tab = self.browser.latest_tab
        self.tab.set.window.max()

    def run(self, is_continue=False):
        """Run the scrape and always release the CSV file and the browser.

        Args:
            is_continue: ``False`` starts over (CSV and ``./filter.txt`` are
                truncated and the header row is written); ``True`` appends to
                the existing CSV and keeps ``./filter.txt`` so finished
                products are skipped.
        """
        csv_path = './men_swimsuits_dickssportinggoods.csv'

        try:
            if is_continue:
                self.file = open(csv_path, 'a', newline='', encoding='utf-8-sig')
                self.writer = csv.DictWriter(self.file, fieldnames=self.field_names)

                # parse reads this file; create it if it does not exist yet.
                open('./filter.txt', 'a', encoding='utf-8').close()
            else:
                self.file = open(csv_path, 'w', newline='', encoding='utf-8-sig')
                self.writer = csv.DictWriter(self.file, fieldnames=self.field_names)
                self.writer.writeheader()

                with open('./filter.txt', 'w', encoding='utf-8') as f:
                    f.write('')

            logging.captureWarnings(True)

            self.init_tab()

            self.parse(self.url)
        finally:
            # Runs on failure too, so rows written so far reach the disk and
            # the browser process does not outlive the script.
            if self.file:
                self.file.close()

            if self.browser:
                self.browser.quit()

if __name__ == "__main__":
    # Script entry point: start a fresh (non-resuming) scrape.
    scraper = Dickssportinggoods()
    scraper.run()

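# https://www.dickssportinggoods.com/f/mens-swimsuits?filterFacets=X_BRAND%253ANike

# Remove the True argument from run() before uploading the code √
# Try scraping the full product data √
# Try scraping part of the product data √
# Search results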