1、使用read_html()读取静态页面的table
静态页面是指不需要经过浏览器解析(执行JavaScript)生成内容、直接请求即可获取完整html源码的页面。例如:
import pandas as pd

# Page whose HTML source already contains the table (no browser rendering
# needed). The percent-encoded path segment decodes to the Baidu Baike entry
# for "mobile network code".
url_mcc = "https://baike.baidu.com/item/%E7%A7%BB%E5%8A%A8%E7%BD%91%E7%BB%9C%E4%BB%A3%E7%A0%81/5935540?fr=aladdin"

# Options handed to pandas.read_html:
#   match      - keep only tables whose text matches this pattern
#   header     - use the first row of the table as the column names
#   converters - read the "MNC" column through str()
#                (presumably to preserve leading zeros - confirm)
read_options = {
    "match": "GSM 900 / GSM 1800 / UMTS 2100",
    "header": 0,
    "converters": {"MNC": str},
}

# read_html downloads the page and returns every matching table as a
# DataFrame; only the first match is exported.
dfs = pd.read_html(url_mcc, **read_options)
first_table = dfs[0]
first_table.to_excel('~/cjavapy.xlsx', index=False)
相关文档:
Python pandas.DataFrame.to_excel函数方法的使用
Python Pandas pandas.read_html函数方法的使用
2、使用Selenium获取需要浏览器解析的html
参考文档:Python Selenium ChromeDriver 获取指定标签元素内的html
# -*- encoding: utf-8 -*-
# Created on 2022-02-20 15:37:50
# Project: read_html()
"""Render a page in headless Chrome and export one of its tables to Excel.

The page is loaded through Selenium so that content produced by the browser
is present in the DOM, the inner HTML of one container element is extracted,
and pandas.read_html parses the matching table out of that fragment.
"""
# NOTE: the original `from distutils.file_util import write_file` import was
# unused and has been dropped; distutils no longer exists in Python 3.12+.
from io import StringIO
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

chrome_options = Options()
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])

# Path to the downloaded chromedriver binary.
c_service = Service("/usr/bin/chromedriver")

# Hand the service to the driver instead of starting it manually:
# webdriver.Chrome starts (and, on quit(), stops) the service itself, so a
# manual start() would leave a second, orphaned chromedriver process behind.
web = webdriver.Chrome(service=c_service, options=chrome_options)
try:
    web.get('https://baike.baidu.com/item/%E7%A7%BB%E5%8A%A8%E7%BD%91%E7%BB%9C%E4%BB%A3%E7%A0%81/5935540?fr=aladdin')
    # Give the browser time to finish rendering the page.
    time.sleep(2)

    # find_element_by_xpath() was removed in Selenium 4.3; use the By locator.
    element = web.find_element(By.XPATH, '/html/body/div[3]/div[2]/div/div[1]')
    page_html = element.get_attribute('innerHTML')
finally:
    # Always close the browser and stop chromedriver, even on failure.
    web.quit()

print(page_html)

# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated since pandas 2.1, while file-like objects are accepted by all
# versions.
dfs = pd.read_html(
    StringIO(page_html),
    match="GSM 900 / GSM 1800 / UMTS 2100",
    header=0,
    converters={"MNC": str},
)
dfs[0].to_excel('~/cjavapy.xlsx', index=False)