本文共 5396 字,大约阅读时间需要 17 分钟。
import random  # random delays so the browsing pattern looks less mechanical
import time  # sleeping between browser actions
from typing import NoReturn, Tuple, List  # type-annotation helpers

import mysql.connector  # MySQL driver
from lxml import etree  # HTML parsing with XPath
from selenium import webdriver  # browser automation
from selenium.webdriver.common.by import By  # locator strategies for find_element

# One scraped product row: (ID, price, product name, evaluation text).
GoodsRow = Tuple[int, float, str, str]


class JD_example():
    """Scrape product listings from jd.com search results into a MySQL table."""

    def __init__(self, table_name: str) -> None:
        """Open the browser on jd.com and connect to MySQL.

        table_name: name of the MySQL table that is created (if missing) and
            used to store the scraped rows.
        """
        self.browser = webdriver.Chrome()  # browser driven by Selenium
        self.browser.get('https://www.jd.com/')  # open the site
        self.table_name = table_name  # target table name in MySQL
        self.start_id = 1  # ID assigned to the next row that gets parsed
        # NOTE(review): connection settings (including the password) are
        # hard-coded here; consider moving them to configuration.
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            passwd='123456',
            port=3307,
            charset='utf8',
            database='reptile',
        )
        self.mycursor = self.conn.cursor()  # cursor used for every statement

    def append_to_database(self) -> None:
        """Continue numbering after the rows already stored in the table."""
        self.start_id = self.get_DBcount() + 1

    def get_DBcount(self) -> int:
        """Return the number of rows currently stored in the table."""
        # The table name is interpolated because SQL identifiers cannot be
        # passed as bound parameters.
        self.mycursor.execute("select count(*) from {};".format(self.table_name))
        result = self.mycursor.fetchone()[0]
        return result

    def set_product(self, keyword: str) -> None:
        """Type a product name into the search box and click the search button.

        keyword: name of the product to search for.
        """
        self.browser.implicitly_wait(5)  # implicit wait for element lookups
        # Locate the search box and type the product name.
        self.browser.find_element(By.CSS_SELECTOR, '#key').send_keys(keyword)
        time.sleep(random.choice([1, 1.4, 1.5, 2]))  # pause a random while
        # Locate the search button and click it.
        self.browser.find_element(By.CSS_SELECTOR, '.button').click()
        self.browser.maximize_window()

    def clear_search_box(self) -> None:
        """Clear the text in the search box."""
        self.browser.find_element(By.CSS_SELECTOR, '#key').clear()
        time.sleep(random.random())

    def drop_down(self) -> None:
        """Scroll down the result page in steps, then to the next-page link."""
        # 11 scroll steps; the step size grows with x and is randomised.
        for x in range(0, 11):
            time.sleep(2)
            js = 'window.scrollBy(0,{})'.format(x * random.randint(70, 95))
            self.browser.execute_script(js)
        # Align the bottom of the "next page" link with the bottom of the
        # viewport (scrollIntoView(false)).
        self.browser.execute_script(
            "document.querySelector('a.pn-next').scrollIntoView(false)"
        )
        time.sleep(random.randint(1, 3))

    def parse_data(self) -> Tuple[GoodsRow, ...]:
        """Parse the current page and return one row per product.

        Returns a tuple of (ID, price, name, evaluation) tuples. Items whose
        price text cannot be converted to a float are skipped. ``start_id``
        is advanced by one for every row returned.
        """
        html = self.browser.page_source  # current page HTML
        e = etree.HTML(html)
        # Each <li> under the goods list corresponds to one product.
        li_list = e.xpath('//*[@id="J_goodsList"]/ul/li')
        res = []
        for li in li_list:
            price_text = ''.join(li.xpath('./div/div[3]/strong/i/text()'))
            try:
                price = float(price_text.strip().replace(' ', ''))
            except ValueError:
                # No usable price in this <li>; skip it, as the original
                # code did, but without hiding unrelated errors.
                continue
            name = li.xpath(
                'string(./div/div[contains(@class,"name")]/a/em)'
            ).strip().replace('\n', '')
            evaluate = li.xpath(
                'string(./div/div[contains(@class,"commit")]/strong)'
            ).strip().replace('\n', '')
            res.append((self.start_id, price, name, evaluate))
            print(self.start_id, price, name, evaluate, sep='|')
            self.start_id += 1
        return tuple(res)

    def click_next_page(self) -> None:
        """Click the "next page" link."""
        self.browser.find_element(By.XPATH, '//a[@class="pn-next"]').click()

    def create_table(self) -> None:
        """Create the target table in MySQL if it does not exist yet."""
        # The placeholder must be "{}": the previous "{ }" is a field named
        # ' ' and made str.format raise KeyError before any SQL was sent.
        sql_create = (
            "create table if not exists {}("
            " ID int ,"
            " price decimal(10, 2),"
            " product_name varchar(200),"
            " evaluate varchar(20),"
            " primary key (ID)"
            " ) ENGINE=INNODB DEFAULT CHARSET='utf8mb4'"
            " COLLATE='utf8mb4_unicode_ci'"
        ).format(self.table_name)
        self.mycursor.execute(sql_create)

    def insert_into_mysql(self, goods_tuple: Tuple[GoodsRow, ...]) -> None:
        """Insert the scraped rows into the table and commit.

        goods_tuple: tuple of rows, each row being a tuple of
            (ID, price, product name, evaluation text).
        """
        if not goods_tuple:
            return  # nothing was parsed on this page
        sql_insert = "insert into {} values (%s, %s, %s, %s)".format(self.table_name)
        self.mycursor.executemany(sql_insert, goods_tuple)
        self.conn.commit()

    def start(self, goods_lists: List[str], page_numbers: int) -> None:
        """Run the scrape.

        goods_lists: product names to search for.
        page_numbers: number of result pages to scrape per product.

        The cursor, the connection and the browser are released when the run
        ends, whether it finished normally or raised.
        """
        try:
            self.create_table()
            for good in goods_lists:
                self.set_product(good)
                self.append_to_database()
                for i in range(0, page_numbers):
                    self.drop_down()
                    time.sleep(5)
                    goods_tuple = self.parse_data()
                    self.insert_into_mysql(goods_tuple)
                    # Only move on when another page will actually be scraped.
                    if i < page_numbers - 1:
                        self.click_next_page()
                self.clear_search_box()
        finally:
            try:
                self.mycursor.close()
                self.conn.close()
            finally:
                # quit() closes every window and ends the driver session.
                self.browser.quit()


if __name__ == '__main__':
    # Script entry point.
    jd1 = JD_example('jd0')
    goods_lists = ['华为手机', 'ViVo手机', 'OPPO手机']
    page_numbers = 1
    jd1.start(goods_lists, page_numbers)
转载地址:http://wxjcz.baihongyu.com/