# Shopify scraping script - https://www.dickssportinggoods.com/f/rods-reels-and-combos?filterFacets=5382%253ARod%2520%2526%2520Reel%2520Combos
# Source listing: https://www.dickssportinggoods.com/f/rods-reels-and-combos?filterFacets=5382%253ARod%2520%2526%2520Reel%2520Combos

import random, csv, requests, logging, time, json, re, execjs
from selenium import webdriver
from copy import deepcopy
from time import sleep, time
from lxml import etree
from DrissionPage import Chromium, ChromiumOptions

class Dickssportinggoods:
    def __init__(self):
        self.url = 'https://www.dickssportinggoods.com/f/rods-reels-and-combos?filterFacets=5382%253ARod%2520%2526%2520Reel%2520Combos'
        self.headers = {}

        self.id = 0  # 保存到csv文件不用id字段
        self.init_data = {
            'Handle': '',
            'Title': '',
            'Body (HTML)': '',
            'Vendor': '',
            'Type': '',
            'Tags': '',
            'Published': 'TRUE',
            'Option1 Name': '',
            'Option1 Value': '',
            'Option2 Name': '',
            'Option2 Value': '',
            'Option3 Name': '',
            'Option3 Value': '',
            'Option4 Name': '',
            'Option4 Value': '',
            'Option5 Name': '',
            'Option5 Value': '',
            'Variant SKU': '',
            'Variant Grams': '',
            'Variant Inventory Tracker': 'Shopify',
            'Variant Inventory Qty': '99999',
            'Variant Inventory Policy': '',
            'Variant Fulfillment Service': '',
            'Variant Price': '',
            'Variant Compare At Price': '',
            'Variant Requires Shipping': '',
            'Variant Taxable': '',
            'Variant Barcode': '',
            'Image Src': '',
            'Image Position': '',
            'Image Alt Text': '',
            'Gift Card': '',
            'SEO Title': '',
            'SEO Description': '',
            'Variant Image': '',
            'Status': '',
            'Collection': '',
        }
        self.empty_data = {
            'Handle': '',
            'Title': '',
            'Body (HTML)': '',
            'Vendor': '',
            'Type': '',
            'Tags': '',
            'Published': '',
            'Option1 Name': '',
            'Option1 Value': '',
            'Option2 Name': '',
            'Option2 Value': '',
            'Option3 Name': '',
            'Option3 Value': '',
            'Option4 Name': '',
            'Option4 Value': '',
            'Option5 Name': '',
            'Option5 Value': '',
            'Variant SKU': '',
            'Variant Grams': '',
            'Variant Inventory Tracker': '',
            'Variant Inventory Qty': '',
            'Variant Inventory Policy': '',
            'Variant Fulfillment Service': '',
            'Variant Price': '',
            'Variant Compare At Price': '',
            'Variant Requires Shipping': '',
            'Variant Taxable': '',
            'Variant Barcode': '',
            'Image Src': '',
            'Image Position': '',
            'Image Alt Text': '',
            'Gift Card': '',
            'SEO Title': '',
            'SEO Description': '',
            'Variant Image': '',
            'Status': '',
            'Collection': '',
        }
        self.field_names = ['Handle', 'Title', 'Body (HTML)', 'Vendor', 'Type', 'Tags', 'Published', 'Option1 Name',
                            'Option1 Value', 'Option2 Name', 'Option2 Value', 'Option3 Name', 'Option3 Value', 'Option4 Name', 'Option4 Value', 'Option5 Name', 'Option5 Value',
                            'Variant SKU', 'Variant Grams', 'Variant Inventory Tracker', 'Variant Inventory Qty',
                            'Variant Inventory Policy', 'Variant Fulfillment Service', 'Variant Price',
                            'Variant Compare At Price', 'Variant Requires Shipping', 'Variant Taxable',
                            'Variant Barcode', 'Image Src', 'Image Position', 'Image Alt Text', 'Gift Card',
                            'SEO Title', 'SEO Description', 'Variant Image', 'Status', 'Collection']
        self.file = None
        self.writer = None

        self.browser = None
        self.tab = None

        self.cnt = 0

        self.total_product_num = 0
        self.total_page = 0

        self.sku_id = 0
        self.img_flag = True

        self.row_path = []
        self.col_path = []

    def simulated_smooth_scroll(self, driver, step=1000, interval=0.5, timeout=30):
        """Scroll the page to the bottom in increments so lazily loaded content can appear.

        Args:
            driver: Object exposing ``execute_script(js)`` (e.g. a Selenium WebDriver).
            step: Maximum number of pixels to scroll per iteration.
            interval: Pause in seconds after a full ``step``; shorter steps pause
                proportionally less.
            timeout: Upper bound in seconds for the whole scroll.
        """
        # The file does ``from time import sleep, time`` after ``import time``, which
        # rebinds the module-level name ``time`` to a function; ``time.time()`` and
        # ``time.sleep()`` would therefore raise AttributeError. Import locally instead.
        from time import monotonic, sleep as pause

        height_js = "return document.documentElement.scrollHeight"
        deadline = monotonic() + timeout
        last_height = driver.execute_script(height_js)
        current_position = 0

        while monotonic() < deadline:
            # Never scroll past the known bottom of the document.
            current_step = min(step, last_height - current_position)
            if current_step <= 0:
                break

            driver.execute_script(f"window.scrollBy(0, {current_step})")
            current_position += current_step

            # Give the scroll and any content it triggers time to settle.
            pause(interval * (current_step / step))

            # The page may have grown; keep scrolling towards the new bottom.
            new_height = driver.execute_script(height_js)
            if new_height > last_height:
                last_height = new_height

    def get_driver(self, url, xpath_txt=None, is_turn=False):
        """Start a headless Selenium Chrome session and load ``url``, retrying until it works.

        Args:
            url: Page to open.
            xpath_txt: Optional XPath that must be locatable for the load to count
                as successful. Without it the method just pauses for about 5 seconds.
            is_turn: When true, scroll to the bottom of the page after loading.

        Returns:
            The Selenium driver with the page loaded. The caller must close it.
        """
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.page_load_strategy = "none"

        driver = webdriver.Chrome(options=options)
        driver.implicitly_wait(10)
        driver.maximize_window()

        while True:
            try:
                print('正在获取', url, '的页面数据')
                driver.get(url)

                if is_turn:
                    sleep(1)
                    self.simulated_smooth_scroll(driver)

                if xpath_txt:
                    driver.find_element('xpath', xpath_txt)
                else:
                    self.random_sleep(5)

                return driver

            # ``Exception`` rather than a bare ``except`` so Ctrl-C / SystemExit can
            # still stop the retry loop; the cause is printed instead of being hidden.
            except Exception as err:
                print(url, '没定位到，重新请求...')
                print(err)

    def driver_continue(self, driver, url, xpath_txt=None, is_turn=False):
        flag = True
        while flag:
            flag = False
            try:
                print('正在获取', url, '的页面数据')
                driver.get(url)

                if is_turn:
                    self.random_sleep()
                    self.simulated_smooth_scroll(driver)

                driver.find_element('xpath', xpath_txt)

            except:
                flag = True
                print(url, '没定位到，重新请求...')

        # self.writer_to_file(driver.page_source, 'w', 'utf-8')

    def get_page_html(self, url, xpath_txt=None, is_turn=False):
        """Fetch ``url`` with a throw-away Selenium session and return it as an lxml tree.

        Args:
            url: Page to open.
            xpath_txt: Optional XPath forwarded to ``get_driver``.
            is_turn: Forwarded to ``get_driver``.

        Returns:
            The result of ``etree.HTML`` on the page source.
        """
        driver = self.get_driver(url, xpath_txt, is_turn=is_turn)
        try:
            page_source = driver.page_source
        finally:
            # quit() ends the whole session; close() only closes the window, and
            # the original released nothing if reading page_source raised.
            driver.quit()

        return etree.HTML(page_source)

    def writer_to_file(self, data, mode, encoding=None):
        """Dump ``data`` to ``./text.html`` (debug helper).

        Args:
            data: ``str`` for text modes, ``bytes`` for binary modes.
            mode: ``open()`` mode such as ``'w'`` or ``'wb'``.
            encoding: Text encoding; ignored for binary modes.
        """
        # Binary-ness is a property of the mode, not of the encoding. Testing the
        # encoding raised TypeError for the default ``None`` and treated encodings
        # containing a "b" (e.g. 'gbk') as binary.
        is_binary = 'b' in mode

        with open('./text.html', mode, encoding=None if is_binary else encoding) as f:
            f.write(data)

        print('写入文件成功！')

    def driver_click(self, driver, timeout=2):
        driver.click()
        self.random_sleep(timeout)

    def driver_back(self, driver, timeout=2):
        driver.back()
        self.random_sleep(timeout)

    def driver_refresh(self, driver, timeout=2):
        driver.refresh()
        self.random_sleep(timeout)

    def tab_wait(self, tab, timeout=3):
        """Call ``tab.wait(timeout)`` and return the same tab so calls can be chained."""
        tab.wait(timeout)

        return tab

    def tab_get(self, tab, url, xpath_txt='html', backup_xpath_txt='html', is_turn=False, timeout=3, off_xpath='//h1', off_txt='Oops, Something Went Wrong.', wait_minute=15):
        while True:
            print('正在获取', url, '的数据')
            tab.get(url)
            self.tab_wait(tab, timeout)

            if off_xpath and off_txt:
                try:
                    t_ele = tab.ele(f'x:{off_xpath}')
                except Exception as e:
                    print('off_xpath这里错了')
                    print(e)
                    tab.wait(timeout)
                    continue

                if t_ele and off_txt in t_ele.text:
                    print(f'ip被封了，等待{wait_minute}分钟再尝试请求')
                    sleep(60 * wait_minute)
                    continue

            if is_turn:
                tab.scroll.to_bottom()
                self.tab_wait(tab, timeout)

            ele = tab.ele(f'x:{xpath_txt}')
            if ele:
                print('第1个xpath找到的')
                break
            else:
                if backup_xpath_txt != 'html':
                    t_ele = tab.ele(f'x:{backup_xpath_txt}')
                    if t_ele:
                        print('第2个xpath找到的')
                        break

            print('没有请求到元素，重新请求中')
            tab.wait(timeout)

        return tab

    def get_dp_html(self, tab, url, xpath_txt='html', is_turn=False):
        """Load ``url`` via ``tab_get`` and return the page as an lxml tree.

        Sleeps for two seconds after parsing, before returning.
        """
        loaded_tab = self.tab_get(tab, url, xpath_txt=xpath_txt, is_turn=is_turn)
        tree = etree.HTML(loaded_tab.html)

        sleep(2)

        return tree

    def random_sleep(self, timeout=2):
        """Sleep for ``timeout`` seconds plus a random extra of up to one second."""
        jitter = random.random()
        sleep(jitter + timeout)

    def save_csv(self, data):
        self.writer.writerow({
            'Handle': data['Handle'],
            'Title': data['Title'],
            'Body (HTML)': data['Body (HTML)'],
            'Vendor': data['Vendor'],
            'Type': data['Type'],
            'Tags': data['Tags'],
            'Published': data['Published'],
            'Option1 Name': data['Option1 Name'],
            'Option1 Value': data['Option1 Value'],
            'Option2 Name': data['Option2 Name'],
            'Option2 Value': data['Option2 Value'],
            'Option3 Name': data['Option3 Name'],
            'Option3 Value': data['Option3 Value'],
            'Option4 Name': data['Option4 Name'],
            'Option4 Value': data['Option4 Value'],
            'Option5 Name': data['Option5 Name'],
            'Option5 Value': data['Option5 Value'],
            'Variant SKU': data['Variant SKU'],
            'Variant Grams': data['VariantGrams'],
            'Variant Inventory Tracker': data['Variant Inventory Tracker'],
            'Variant Inventory Qty': data['Variant Inventory Qty'],
            'Variant Inventory Policy': data['Variant Inventory Policy'],
            'Variant Fulfillment Service': data['Variant Fulfillment Service'],
            'Variant Price': data['Variant Price'],
            'Variant Compare At Price': data['Variant Compare At Price'],
            'Variant Requires Shipping': data['Variant Requires Shipping'],
            'Variant Taxable': data['Variant Taxable'],
            'Variant Barcode': data['Variant Barcode'],
            'Image Src': data['Image Src'],
            'Image Position': data['Image Position'],
            'Image Alt Text': data['Image Alt Text'],
            'Gift Card': data['Gift Card'],
            'SEO Title': data['SEO Title'],
            'SEO Description': data['SEO Description'],
            'Variant Image': data['Variant Image'],
            'Status': data['Status'],
            'Collection': data['Collection']
        })

    def get_response(self, url):
        """GET ``url`` with ``self.headers``, retrying until a response is received.

        Returns:
            The ``requests`` response object. The status code is not checked.
        """
        print('正在获取', url, '的数据')

        while True:
            try:
                # A timeout stops a stalled connection from blocking forever;
                # a timed-out request is simply retried like any other failure.
                response = requests.get(url, headers=self.headers, timeout=30)
                break
            # Only network-level failures are retried; a bare ``except`` also
            # swallowed Ctrl-C, which made the loop impossible to interrupt.
            except requests.RequestException as err:
                print('没有请求到，重新请求')
                print(err)
                self.random_sleep()

        self.random_sleep()

        return response

    def get_html(self, url):
        """Fetch ``url`` with ``get_response`` and parse the body into an lxml tree."""
        body = self.get_response(url).text

        return etree.HTML(body)

    def tab_run_js(self, tab, js_code):
        """Run ``js_code`` in ``tab``, retrying once per second until it does not raise."""
        succeeded = False

        while not succeeded:
            try:
                tab.run_js(js_code)
            except Exception as err:
                print('捕获tab_run_js方法的run_js:', err)
                tab.wait(1)
            else:
                succeeded = True

    def ele_click(self, tab, ele, timeout=2):
        """Click ``ele`` through ``tab.actions``, retrying until the click does not raise.

        Waits ``timeout`` seconds after every failed attempt and once more after
        the successful click.
        """
        clicked = False

        while not clicked:
            try:
                tab.actions.click(ele)
            except Exception as err:
                print('捕获ele_click方法的actions.click:', err)
                tab.wait(timeout)
            else:
                clicked = True

        tab.wait(timeout)

    def dp_click_ad(self, tab, xpath_txt):
        ad_ele = tab.ele(f'x:{xpath_txt}', timeout=2)
        if ad_ele:
            print('有广告:', ad_ele)
            self.ele_click(tab, ad_ele)
            self.tab_wait(tab, 1)

    def infinite_scroll(self, tab):
        product_url_list = []
        w_cnt = 0
        turn_cnt = 0
        is_bottom = False

        while True:
            self.dp_click_ad(tab, '//button[@class="css-2vqtum"]')
            self.dp_click_ad(tab, '//button[@class="css-71t821"]')

            if is_bottom:
                break

            tab.scroll.to_bottom()
            self.tab_wait(tab, 1)
            self.tab_run_js(self.tab, 'window.scrollBy(0, -1000)')

            turn_cnt += 1
            print(f'翻页了{turn_cnt}次')

            n_cnt = 0

            self.tab_wait(tab, 5)
            product_url_ele_list = tab.eles('x://div[@class="product-thumbnail plpv3 css-1gasjii"]//a[@class="css-avqw6d"]')

            for product_url_ele in product_url_ele_list:
                product_url = product_url_ele.attr('href')
                if 'https:' not in product_url:
                    product_url = 'https://www.coach.com' + product_url

                if product_url in product_url_list:
                    continue

                n_cnt += 1
                product_url_list.append(product_url)

            print('产品个数：', len(product_url_ele_list))
            print('获取到的产品个数', len(product_url_list))

            if n_cnt == 0:
w_cnt += 1
            else:
                w_cnt = 0

            if w_cnt >= 5:
                print('到底了')
                is_bottom = True

        return product_url_list

    def get_detail_tab(self, url, xpath_txt='html', backup_xpath_txt='html'):
        print('正在获取', url, '的数据')

        tab = self.browser.latest_tab

        while True:
            tab.get(url)
            self.tab_wait(tab)

            ele = tab.ele(f'x:{xpath_txt}')
            if ele:
                print('第1个xpath找到的')
                break
            else:
                t_ele = tab.ele(f'x:{backup_xpath_txt}')
                if t_ele:
                    print('第2个xpath找到的')
                    break

            print('没有请求到元素，重新请求中')
            tab.wait(2)

        return tab

    def get_product_img_url_list(self, tab, data):
        tab.wait(5)
        self.dp_click_ad(tab, '//button[@class="css-2vqtum"]')
        self.dp_click_ad(tab, '//button[@class="css-71t821"]')

        data['Option1 Value'] = tab.ele('x://p[contains(@class, "color-name")]').text

        product_color_img_url_list = []

        img_temp_ele = tab.ele('x://li[contains(@class, "is-prev")]')
        if img_temp_ele:
            product_color_img_url_cnt = int(img_temp_ele.attr('data-slide-index'))
        else:
            product_color_img_url_cnt = len(tab.eles('x://div[@class="css-1161qt5"]/div')) - 1

        print(data['Option1 Value'], '色的图片个数应为：', product_color_img_url_cnt)

        tab.wait(6)
        raw_product_color_img_ele_list = tab.eles('x://li[contains(@class, "splide__slide")]/div | //li[contains(@class, "splide__slide")]//video/source | //div[@class="css-1161qt5"]/div[@class="css-113el1s"]/div')

        for raw_product_color_img_ele in raw_product_color_img_ele_list:
            raw_product_color_img_url = raw_product_color_img_ele.attr('src')

            if not raw_product_color_img_url:
                continue

            raw_product_color_img_url = raw_product_color_img_url.split('?')[0]

            if raw_product_color_img_url in product_color_img_url_list:
                continue

            product_color_img_url_list.append(raw_product_color_img_url)

        print(data['Option1 Value'], '色获取到的图片个数为：', len(product_color_img_url_list))

        return product_color_img_url_list

    def init_tab(self):
        """Launch Chrome through DrissionPage and keep the browser and its latest tab.

        The Chrome executable path is hard-coded to the default Windows install
        location; the window is maximised after start-up.
        """
        options = ChromiumOptions()
        options.set_browser_path(r'C:\Program Files\Google\Chrome\Application\chrome.exe')
        options.auto_port()
        # Optional switches, currently disabled:
        # options.headless()
        # options.set_argument('--no-sandbox')

        self.browser = Chromium(options)
        self.tab = self.browser.latest_tab

        self.tab.set.window.max()

    def handle_img_url(self, raw_img_url):
        """Return ``raw_img_url`` unchanged (placeholder for URL normalisation)."""
        img_url = raw_img_url
        return img_url

    def save_all_data(self, product_data_list):
        for product_data in product_data_list:
            self.cnt += 1
            self.save_csv(product_data)
            print('第', self.cnt, '行保存完成！')

    # Turn off page translation
    def cancel_translate(self, url):
        tab = self.tab_get(self.tab, url, '//div[@class="collection_item overflow-hidden"]')
        tab.actions.move_to(tab.ele('x://div[@class="gt-translate-btn-bg"]'))
        tab.wait(2)
        tab.actions.click(tab.ele('x://div[@class="gt-translate-switch"]'))
        tab.wait(2)

    def click_accept(self, tab=None, url=None, xpath_txt='html'):
        if not tab and url:
            tab = self.get_detail_tab(url, xpath_txt=xpath_txt)

        temp_ele = tab.ele('x://div[contains(@class, "policy_acceptBtn")]')
        if temp_ele:
            self.ele_click(tab, temp_ele)

    def check_size(self, product_size_list):
        """Return True when ``product_size_list`` contains no repeated value."""
        sizes = list(product_size_list)

        # A value is a repeat when it already occurs among the items before it.
        has_repeat = any(size in sizes[:position] for position, size in enumerate(sizes))

        return not has_repeat

    def test_product(self, url):
        """Scrape the single product at ``url`` into ``./test.csv`` (manual debugging aid).

        Args:
            url: Product detail page to scrape.
        """
        self.file = open('./test.csv', 'w', newline='', encoding='utf-8-sig')
        try:
            self.writer = csv.DictWriter(self.file, fieldnames=self.field_names)
            self.writer.writeheader()

            logging.captureWarnings(True)

            self.init_tab()

            self.id += 1
            print(self.id, '开始')

            product_data_list = self.product_detail_parse(url, 'One')

            self.save_all_data(product_data_list)

            print(self.id, '结束')
        finally:
            # Release the CSV handle and the browser even when scraping raises;
            # the original leaked both on any exception.
            if self.file:
                self.file.close()

            if self.browser:
                self.browser.quit()

    def is_have_dot_line(self, tab, row, col):
        """Return True when the option button at (``row``, ``col``) is marked unavailable.

        The check looks for ``hmf-option-unavailable`` in the button's ``class`` attribute.
        """
        print('is_have_dot_line', row, col)

        locator = f'x://pdp-attributes-components-attributes[@class="hmf-mx-s hmf-mx-m-0"]/section/div[{row}]//div[contains(@class, "hmf-flex-wrap")]/div[{col}]/button[contains(@class, "hmf-selectable-option")]'
        button_class = tab.ele(locator).attr('class')

        return 'hmf-option-unavailable' in button_class

    def get_product_color_url(self, tab, url, row, col):
        """Build the product URL for the colour swatch at (``row``, ``col``).

        The colour name comes from the swatch image's ``alt`` text, with spaces
        replaced by ``/``; any existing query string on ``url`` is dropped.
        """
        locator = f'x://pdp-attributes-components-attributes[@class="hmf-mx-s hmf-mx-m-0"]/section/div[{row}]//div[contains(@class, "sliding-row-inner")]/div[{col}]//button[contains(@class, "pdp-color-swatch-selected")]/img'
        color_name = tab.ele(locator).attr('alt').strip().replace(' ', '/')

        base_url = url.split('?')[0]

        return base_url + '?color=' + color_name

    def dfs_click(self, tab, row, col, timeout=2):
        print('dfs_click', row, col)
        t_ele = tab.ele(f'x://pdp-attributes-components-attributes[@class="hmf-mx-s hmf-mx-m-0"]/section/div[{str(row)}]//div[contains(@class, "hmf-flex-wrap")]/div[{str(col)}]/button[contains(@class, "hmf-selectable-option")]')

        self.ele_click(tab, t_ele, timeout)

    def get_img_url(self, tab, url):
        """Expand the product image gallery and collect the image URLs.

        Tries up to three times to click the gallery's expand button, reads the
        ``srcset``/``src`` of each ``pdp-product-image`` source element, and
        reloads ``url`` to start over when the result looks incomplete.

        Args:
            tab: Tab showing the product page.
            url: URL reloaded through ``tab_get`` when a retry is needed.

        Returns:
            List of image URL strings accumulated over all attempts.

        NOTE(review): several calls use ``self.tab`` while others use the ``tab``
        argument; presumably they are the same object — confirm against callers.
        NOTE(review): ``product_img_url_list`` is not cleared between retries, so a
        retry appends to what earlier attempts collected — confirm this is intended.
        """
        product_img_url_list = []

        while True:
            # Stays True while the current attempt is considered successful.
            flag_t = True

            # Scroll down to the gallery, retrying until run_js does not raise.
            while True:
                try:
                    tab.run_js('window.scrollTo(0, 1500);')
                    break
                except Exception as e:
                    print(e)

            self.tab_wait(self.tab, 1)

            cnt = 0           # click attempts in this round
            all_img_cnt = 0   # image count parsed from the expand button's label
            click_text = ''

            while True:
                cnt += 1

                # Dismiss pop-ups that could cover the button.
                self.dp_click_ad(self.tab, '//i[@class="fa fa-times font-size-20"]')
                self.dp_click_ad(self.tab, '//button[@data-event="click/close"]')
                self.tab_wait(self.tab, 1)

                temp_ele = tab.ele('x://button[@class="hmf-button hmf-label-m medium-btn secondary-btn"]')
                print(temp_ele)

                if temp_ele:
                    click_text = tab.ele('x://button[@class="hmf-button hmf-label-m medium-btn secondary-btn"]//span[@class="hmf-button-container"]//span').text

                print('点击的文本：', click_text)

                if 'All' in click_text:
                    # The second-to-last word of the label is parsed as the total image count.
                    all_img_cnt = int(click_text.strip().split(' ')[-2])
                    self.tab.actions.click(temp_ele)
                    print('点击了加载图片按钮， 尝试点击的第', cnt, '下')
                elif 'Less' in click_text:
                    # Label contains "Less": the gallery is treated as expanded.
                    break

                if cnt >= 3:
                    if 'All' in click_text:
                        # Still not expanded after three clicks: reload and retry.
                        print('没点到扩展，重新请求')
                        flag_t = False
                        break
                    elif not temp_ele:
                        # No expand button found at all.
                        print('可能真没有扩展图片')
                        break

            if not flag_t:
                tab = self.tab_get(tab, url, '//div[@class="description-text"]') # request #4

                continue

            self.tab_wait(self.tab, 6)

            # Every path through this loop ends in ``break``, so it runs exactly once.
            while True:
                product_img_url_ele_list = tab.eles('x://pdp-product-image//picture//source[1]')

                for product_img_url_ele in product_img_url_ele_list:
                    try:
                        product_img_url_list.append(product_img_url_ele.attr('srcset'))
                    except:
                        # Fall back to ``src`` when reading ``srcset`` raises.
                        try:
                            product_img_url_list.append(product_img_url_ele.attr('src'))
                        except Exception as e:
                            print('图片错误！')
                            print(e)
                            continue

                product_img_url_len = len(product_img_url_list)

                print('所有的图片个数应为：', all_img_cnt)
                print('获取到的图片个数为：', product_img_url_len)

                if cnt >= 3 or product_img_url_len == all_img_cnt:
                    break
                else:
                    print('没获取到所有的图片，重新获取中')
                    flag_t = False
                    break

            tab.run_js('window.scrollTo(0, -1500);')

            # Accept the result if the attempt succeeded or at least five URLs were collected.
            if product_img_url_len >= 5 or flag_t:
                break
            else:
                tab = self.tab_get(tab, url, '//div[@class="description-text"]') # request #5

        return product_img_url_list

    def insert_data(self, data_list, img_url_list, data, tab, url, name_list, total_row, col_list):
        _data = deepcopy(data)
        self.sku_id += 1
        img_url_len = len(img_url_list)

        _data['Variant SKU'] = _data['Handle']

        for row in range(total_row):
            _data[f'Option{str(row + 1)} Value'] = tab.eles('x://p[@class="hmf-pt-xs"]//span[contains(@class, "hmf-header-xs")]')[row].text.strip()

            _data['Variant SKU'] =  _data['Variant SKU'] + '-' + _data[f'Option{str(row + 1)} Value'].lower().replace(' ', '-')

        _data['Variant SKU'] = _data['Variant SKU'] + '-' + str(self.sku_id)

        price_ele = tab.ele('x://div[@class="hmf-mb-xxxs hmf-header-bold-l"]/span')
        if price_ele:
            _data['Variant Price'] = price_ele.text.strip('$')
        else:
            try:
                _data['Variant Price'] = tab.ele('x://span[@class="product-price"]').text.strip('$')
                _data['Variant Compare At Price'] = tab.ele('x://span[@class="price-list"]/span').text.strip('$')
            except:
                try:
                    _data['Variant Price'] = tab.ele('x://span[@class="hmf-mb-xxxs hmf-header-bold-l price-list"]/span').text.strip('$')
                except:
                    _data['Variant Price'] = '999999'

        try:
            _data['Image Src'] = img_url_list[0]
        except:
            print('这个产品没有图片')
            return

        _data['Image Position'] = 1
        _data['Variant Image'] = _data['Image Src']

        print(f'第{self.id}个产品 {url} 的第{self.sku_id}个sku，', end='')
        for row in range(total_row):
            print(f'字段{name_list[row]}的个数为：{col_list[row]},', end='')

        print(f'图片个数为：{img_url_len}')

        data_list.append(_data)
        print(_data)

        for i in range(1, img_url_len):
            temp_data = deepcopy(self.empty_data)

            temp_data['Handle'] = _data['Handle']
            temp_data['Published'] = 'TRUE'
            temp_data['Image Src'] =img_url_list[i]
            temp_data['Image Position'] = i + 1

            data_list.append(temp_data)
            print(temp_data)

    def check_field_val_is_all_exist(self, tab):
        for path_id in range(len(self.row_path)):
            if 'Color' == self.row_path[path_id]:
                continue

            self.dfs_click(tab, self.row_path[path_id], self.col_path[path_id])

        field_val_ele_list = tab.eles('x://p[@class="hmf-pt-xs"]//span[contains(@class, "hmf-header-xs")]')
        for field_val_ele in field_val_ele_list:
            if not field_val_ele.text:
                print('字段组合不存在')
                return False

        return True

    def dfs_insert_data(self, data_list, data, img_url_list, tab, url, name_list, total_row, col_list, row):
        """Walk every option combination depth-first and emit rows for the valid ones.

        Each recursion level handles one option row (1-based ``row``) and loops
        over its values. The current selection is tracked in ``self.row_path`` /
        ``self.col_path``; at the last row the combination is checked and, when
        it exists, written with ``insert_data``.

        Args:
            data_list: Output list of CSV rows, filled in place.
            data: Row dict for the product; Title/Handle/Body are set here.
            img_url_list: Image URLs gathered so far for the product.
            tab: Tab showing the product page.
            url: Product URL.
            name_list: Option names, one per option row.
            total_row: Number of option rows.
            col_list: Number of values per option row.
            row: 1-based index of the option row handled by this call.

        NOTE(review): when ``total_row`` is 0 the first call returns at once, so a
        product without option rows yields no data — confirm this is intended.
        """
        if row > total_row:
            return

        t_url = url
        for col in range(1, col_list[row - 1] + 1):
            if 1 == row and ('Color' in name_list[row - 1]):
                # A colour in the first row has its own URL: load it and refetch the images.
                color_url = self.get_product_color_url(tab, url, row, col)
                t_url = color_url

                tab = self.tab_get(tab, color_url, '//div[@class="description-text"]') # request #3

                img_url_list = self.get_img_url(tab, t_url)
                self.img_flag = False

                # Placeholder entries: colours are selected by URL, not by clicking.
                self.row_path.append('Color')
                self.col_path.append('Color')
            else:
                # Images are fetched only while img_flag is still set.
                if self.img_flag:
                    img_url_list = self.get_img_url(tab, t_url)
                    self.img_flag = False

                self.dp_click_ad(tab, '//button[@class="css-2vqtum"]')
                self.dp_click_ad(tab, '//button[@class="css-71t821"]')

                self.row_path.append(row)
                self.col_path.append(col)

            if 1 == row:
                # Product-level fields are refreshed for every first-row value.
                data['Title'] = self.tab.ele('x://h1[@class="hmf-header-bold-m hmf-header-bold-m-l"]').text.strip()
                data['Handle'] = data['Title'].lower().replace(' ', '-') + '-' + data['Type'].lower().replace(' ', '-')
                data['Body (HTML)'] = self.tab.ele('x://div[@class="description-text"]').html

            if row == total_row:
                # Last row: the path is a full combination; emit it if the page accepts it.
                if self.check_field_val_is_all_exist(tab):
                    self.insert_data(data_list, img_url_list, data, tab, t_url, name_list, total_row, col_list)

            self.dfs_insert_data(data_list, data, img_url_list, tab, url, name_list, total_row, col_list, row + 1)

            # Backtrack before trying the next value of this row.
            self.row_path.pop()
            self.col_path.pop()

    def check_field_val_num(self, field_num, field_val_num_list):
        """Return True when no option from the fourth one onward has more than one value.

        Args:
            field_num: Number of options on the product.
            field_val_num_list: Value count for each option.
        """
        extra_field_ids = range(3, field_num)
        has_multi_valued = any(field_val_num_list[field_id] > 1 for field_id in extra_field_ids)

        return not has_multi_valued

    def product_detail_parse(self, url, _type):
        product_data_list = []

        data = deepcopy(self.init_data)
        data['Type'] = _type
        data['Collection'] = data['Type']

        tab = self.tab_get(self.tab, url, '//div[@class="description-text"]') # 第2个请求

        # ------------------------------------
        # 获取字段的名称 和 字段的数量 和 每个字段拥有的值的数量

        field_name_list = []
        field_val_num_list = []

        field_name_ele_list = tab.eles('x://p[@class="hmf-pt-xs"]//span[contains(@class, "hmf-header-bold-xs")]')
        field_num = len(field_name_ele_list)

        for field_i in range(field_num):
            data[f'Option{str(field_i + 1)} Name'] = field_name_ele_list[field_i].text.replace(':', '').strip()
            field_name_list.append(data[f'Option{str(field_i + 1)} Name'])

            if 'Color' in data[f'Option{str(field_i + 1)} Name']:
                field_val_ele_len = len(tab.eles(f'x://section[.//p[@class="hmf-pt-xs"]//span[contains(@class, "hmf-header-bold-xs")]]//button[contains(@class, "pdp-color-swatch-selected")]'))
            else:
                field_val_ele_len = len(tab.eles(f'x://pdp-attributes-components-attributes[@class="hmf-mx-s hmf-mx-m-0"]/section/div[{str(field_i + 1)}]//div[contains(@class, "hmf-flex-wrap")]/div/button[contains(@class, "hmf-selectable-option")]/span'))
            field_val_num_list.append(field_val_ele_len)

        # ------------------------------------

        print(f'有{field_num}个字段')
        print('字段的名称有：', field_name_list)
        for field_i in range(field_num):
            print(f'{field_name_list[field_i]}拥有的值的数量为：{field_val_num_list[field_i]}')

        if field_num > 3:
            if not self.check_field_val_num(field_num, field_val_num_list):
                return product_data_list

        self.img_flag = True
        self.sku_id = 0
        self.dfs_insert_data(product_data_list, data, [], tab, url, field_name_list, field_num, field_val_num_list, 1)

        return product_data_list

    def parse(self):
        """Walk every listing page and scrape each product not yet recorded as done.

        Finished product URLs are appended to ``./filter.txt`` (one per line) so
        an interrupted run can be resumed without scraping them again.
        """
        listing_xpath = '//div[contains(@class, "pro-tips-carousel-item")]'
        page = 0

        # Load the finished URLs once. The original re-read the whole file for
        # every product, crashed when the file did not exist, and used substring
        # matching, so a URL that is a prefix of a finished one was skipped too.
        try:
            with open('./filter.txt', 'r', encoding='utf-8') as f:
                finished_urls = {line.rstrip('\n') for line in f if line.strip()}
        except FileNotFoundError:
            finished_urls = set()

        self.get_dp_html(self.tab, self.url, listing_xpath)  # request #1

        while True:
            index_html = etree.HTML(self.tab.html)

            is_next = len(index_html.xpath('//button[@title="Next Page"]')) > 0

            print('是否有下一页:', is_next)

            product_url_list = index_html.xpath('//div[@class="product-card-image hmf-mt-xxs hmf-mb-xs"]/a[@class="image"]/@href')

            for product_url in product_url_list:
                self.id += 1

                if 'https:' not in product_url:
                    product_url = 'https://www.dickssportinggoods.com' + product_url

                product_url = product_url.replace(' ', '/')

                if product_url in finished_urls:
                    print(self.id, '已完成')
                    continue

                print(self.id, '开始')

                product_data_list = self.product_detail_parse(product_url, 'Fishing Rods, Reels & Combos')

                self.save_all_data(product_data_list)

                print(self.id, '结束')

                # Record progress only after the rows have been written.
                with open('./filter.txt', 'a', encoding='utf-8') as f:
                    f.write(product_url + '\n')
                finished_urls.add(product_url)

            if not is_next:
                break

            # The tab was used for product pages, so reload the listing and
            # click "Next Page" once per page already processed.
            self.get_dp_html(self.tab, self.url, listing_xpath)

            page += 1
            for _ in range(page):
                self.tab.scroll.to_bottom()
                self.tab.wait(2)
                self.tab_run_js(self.tab, 'window.scrollBy(0, -2500)')
                self.tab.wait(2)
                self.ele_click(self.tab, self.tab.ele('x://button[@title="Next Page"]'))
                self.tab.wait(3)

    def run(self, is_continue=False):
        """Scrape the whole listing into ``./dickssportinggoods_8_fishing_rods.csv``.

        Args:
            is_continue: When true, append to the existing CSV and keep
                ``./filter.txt`` so finished products are skipped. Otherwise start
                a fresh CSV (with header) and empty ``./filter.txt``.
        """
        csv_path = './dickssportinggoods_8_fishing_rods.csv'

        self.file = open(csv_path, 'a' if is_continue else 'w', newline='', encoding='utf-8-sig')
        try:
            self.writer = csv.DictWriter(self.file, fieldnames=self.field_names)

            if not is_continue:
                self.writer.writeheader()

                # A fresh run forgets previously finished products.
                with open('./filter.txt', 'w', encoding='utf-8') as f:
                    f.write('')

            self.init_tab()

            # This call sat at column 0 in the original, which ended the class
            # body early and made the file fail to compile.
            logging.captureWarnings(True)

            self.parse()
        finally:
            # Close the CSV and the browser even when scraping raises.
            if self.file:
                self.file.close()

            if self.browser:
                self.browser.quit()

# Script entry point: run the full scrape. The URLs below are sample product
# pages kept for manual testing with test_product().
if __name__ == '__main__':
    dickssportinggoods = Dickssportinggoods()
    dickssportinggoods.run()

    url1 = 'https://www.dickssportinggoods.com/p/ugly-stik-bigwater-spinning-combo-22ustuglybwspn50scom/22ustuglybwspn50scom' # 4 fields; Rod Length and Reel Size are paired selections; prices differ between variants
    url2 = 'https://www.dickssportinggoods.com/p/daiwa-x2-samurai-spinning-combo-24daiusmrx2407fmhcom/24daiusmrx2407fmhcom' # 4 fields; Rod Length and Power are paired selections; variants share one price, and there is a discounted price
    url3 = 'https://www.dickssportinggoods.com/p/ugly-stik-catfish-spinning-combo-15sksus50szsp7f2pcom/15sksus50szsp7f2pcom' # 3 fields (Rod Length, Power, Pieces) and only one image
    url4 = 'https://www.dickssportinggoods.com/p/profishiency-krazy-rizz-baitcast-combo-24fvnukrzyrzz7ft2com/24fvnukrzyrzz7ft2com' # 4 fields; different Retrieve choices, same price
    url5 = 'https://www.dickssportinggoods.com/p/zebco-roam-spincast-combo-19zeburmgrn602mspcom/19zeburmgrn602mspcom?color=Green' # 3 fields, includes Color
    url6 = 'https://www.dickssportinggoods.com/p/zebco-roam-baitcasting-combo-22zeburm6f6mh2prhcom/22zeburm6f6mh2prhcom?color=Green' # 3 fields, includes Color, plus a different field: Gear Ratio
    url7 = 'https://www.dickssportinggoods.com/p/lews-crappie-thunder-jig-troll-spinning-combo-23lewucrppthndr12com/23lewucrppthndr12com' # 4 fields; price varies with Rod Length
    url8 = 'https://www.dickssportinggoods.com/p/zebco-wonder-woman-50-spinning-combo-25zebuwndrwmn502mcom/25zebuwndrwmn502mcom' # 1 field
    url9 = 'https://www.dickssportinggoods.com/p/penn-fierce-iv-spinning-combo-22pnfufrcv80007ftcom/22pnfufrcv80007ftcom' # 5 fields
    url10 = 'https://www.dickssportinggoods.com/p/zebco-roam-spincast-combo-19zeburmgrn602mspcom/19zeburmgrn602mspcom?color=Orange'
    # dickssportinggoods.test_product(url1)

# https://www.dickssportinggoods.com/f/rods-reels-and-combos?filterFacets=5382%253ARod%2520%2526%2520Reel%2520Combos

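# Remove the True argument before uploading the code (done)
# Try with the full product data (done)
# Try with part of the product data (done)

# Rod Length    Color (determines the images)
# Reel Size     Power
# Retrieve      Action      Gear Ratio
# Pieces

# This one has 5 fields; it does not need to be changed, it is fine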
# 搜索结果 ("search results": stray text left over from a copy/paste)