本文是一个用 Python 3 编写的数据获取(爬取)案例。
from tqdm import tqdm
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
import pandas as pd
import numpy as np
import os
# Candidate search keywords (region names), four per row.
position = [
    "北京", "天津", "上海", "重庆",
    "河北", "山西", "辽宁", "吉林",
    "福建", "江西", "山东", "河南",
    "湖北", "湖南", "广东", "海南",
    "四川", "贵州", "云南", "陕西",
    "甘肃", "青海", "台湾", "内蒙古",
    "广西", "西藏", "宁夏", "新疆",
    "香港", "澳门",
]
# The list above is immediately superseded: only this single keyword is crawled.
position = ['北京']

# Module-level result columns. get_one_page() appends one entry per scraped
# listing to each of these, so equal indexes describe the same listing.
name = []
level = []
hot = []
address = []
num = []
def get_one_page(key, page, retries=3):
    """Scrape one result page of the Qunar ticket search and record its listings.

    A headless Chrome session loads the search page for ``key`` / ``page``.
    For every ``sight_item`` element found, one entry is appended to each of
    the module-level lists ``name``, ``level``, ``hot``, ``address`` and
    ``num``. Rows are buffered locally and committed only after the whole page
    was read, so a failed attempt never leaves partial or duplicated rows.

    Args:
        key: Search keyword; converted with ``str()`` and placed in the URL.
        page: Page number; converted with ``str()`` and placed in the URL.
        retries: Maximum number of attempts for this page. Values below 1
            are treated as 1.

    Returns:
        None. Results are delivered through the module-level lists.

    Raises:
        WebDriverException: (including ``TimeoutException``) if the last
            attempt still fails.
    """
    # Local imports: the file's import block does not provide these names.
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    url = (
        "http://piao.qunar.com/ticket/list.htm?keyword=" + str(key)
        + "&region=&from=mpl_search_suggest&page=" + str(page)
    )
    attempts = max(1, retries)

    for attempt in range(1, attempts + 1):
        driver = None
        rows = []
        try:
            option_chrome = webdriver.ChromeOptions()
            option_chrome.add_argument('--headless')
            driver = webdriver.Chrome(options=option_chrome)
            # Fixed pause before each request, kept from the original code
            # (presumably to throttle the crawl -- confirm before changing).
            time.sleep(5)
            driver.get(url)

            for item in driver.find_elements(By.CLASS_NAME, "sight_item"):
                # Listing name.
                sight_name = item.find_element(By.CLASS_NAME, "name").text
                # Rating level; not every listing has one.
                try:
                    sight_level = item.find_element(By.CLASS_NAME, "level").text
                except NoSuchElementException:
                    sight_level = ""
                # Popularity; the first three characters of the text are
                # dropped (presumably a label prefix -- verify on the page).
                sight_hot = item.find_element(
                    By.CLASS_NAME, "product_star_level").text[3:]
                # Address.
                sight_address = item.find_element(By.CLASS_NAME, "area").text
                # Sales volume; falls back to 0 when the element is absent.
                try:
                    sight_num = item.find_element(By.CLASS_NAME, "hot_num").text
                except NoSuchElementException:
                    sight_num = 0
                rows.append(
                    (sight_name, sight_level, sight_hot, sight_address, sight_num)
                )
        except (TimeoutException, WebDriverException):
            # Retry with a fresh browser; give up after the last attempt.
            if attempt >= attempts:
                raise
        else:
            # Commit the buffered rows only once the page was read completely.
            for sight_name, sight_level, sight_hot, sight_address, sight_num in rows:
                name.append(sight_name)
                level.append(sight_level)
                hot.append(sight_hot)
                address.append(sight_address)
                num.append(sight_num)
            return
        finally:
            # Always release the browser process, on success and on failure.
            if driver is not None:
                driver.quit()
# Crawl result pages 1 through 13 for every keyword in ``position``.
# The printed messages are progress output shown alongside the tqdm bar.
for key in tqdm(position):
    print("正在爬取{}".format(key))
    for page in range(1, 14):
        print("正在爬取第{}页".format(page))
        get_one_page(key, page)

# Assemble the collected columns into a table and save it as CSV.
# "utf_8_sig" writes a UTF-8 byte-order mark at the start of the file.
columns = ['name', 'level', 'hot', 'address', 'num']
sight = pd.DataFrame(
    dict(zip(columns, (name, level, hot, address, num))),
    columns=columns,
)
sight.to_csv("sight.csv", encoding="utf_8_sig")
运行这段代码需要先下载 chromedriver.exe,下载地址是:
http://chromedriver.chromium.org/downloads
本案例获取的数据是以.csv格式保存到本地。
发表回复