python百度收录批量查询

python版本 3.8
安装以下插件

pip3 install requests
（json 是 Python 标准库自带模块，无需通过 pip 安装）
pip3 install lxml
pip3 install selenium

然后到 https://chromedriver.storage.googleapis.com/index.html 下载与本机谷歌浏览器版本对应的
浏览器驱动，放到本目录覆盖 chromedriver.exe。
如果不知道版本，执行一下 python main.py
github地址
https://github.com/linwoodpendleton/baidusoulu
如下图：

源码：
code:
import json
import random
import time
from random import randint

import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Module-level setup: these statements run when the file is executed and
# create the single Chrome session (`driver`) used by the functions below.
chrome_options = Options()

# Baidu cookies; copy the cookie string straight from the browser.
# The value below is a placeholder and must be replaced before running.
cookies_str = 'BDORZ=xxxooo; BA_HECTOR=xxxooo; BDRCVFR[xxxoo]=xxxooo; H_PS_PSSID=xxxooo; H_WISE_SIDS=xxxooo'

# Ask Chrome to start with a maximized window.
chrome_options.add_argument("--start-maximized")

# Chrome driver executable; download the version matching the installed
# browser from https://chromedriver.storage.googleapis.com/index.html
s = Service('chromedriver.exe')

driver = webdriver.Chrome(service=s, options=chrome_options)

# Implicit wait of 10 (seconds, per Selenium's implicitly_wait API) for
# element lookups made through this driver.
driver.implicitly_wait(10)

# Open Baidu first.
# NOTE(review): presumably so that cookies for the Baidu domain can be added
# afterwards -- confirm against Selenium's add_cookie requirements.
driver.get('http://www.baidu.com')

def cookies_to_dict(cookie):
    """Install the cookies of a browser-style cookie string into the driver.

    Despite its name, this function does not build or return a dict: every
    ``name=value`` pair found in *cookie* is handed to ``driver.add_cookie``.

    :param cookie: cookie string with pairs separated by ``;``

    :return: None
    """
    for pair in cookie.split(";"):
        # Split on the first "=" only, because a cookie value may itself
        # contain "=" (or even repeat the cookie name).
        name, sep, value = pair.strip().partition("=")
        name = name.strip()
        if not sep or not name:
            # Empty fragment (e.g. a trailing ";") or a piece without "=":
            # there is no usable name/value pair to install.
            continue
        # Double quotes are removed from the value, as the original code did.
        driver.add_cookie({"name": name, "value": value.replace('"', '')})
def check_index_number(url):
    """Query how many pages of a site Baidu reports as indexed.

    :param url: the site to query; it is appended verbatim to a ``site:``
        search, so the caller is responsible for any escaping

    :return: a tuple ``(index_number, yanzen)``. ``index_number`` is the text
        of the result-count element, or ``0`` when it cannot be read.
        ``yanzen`` is the text of the element the caller treats as a captcha
        prompt, ``None`` when that element is absent, or ``''`` when the page
        could not be loaded.
    """
    joinUrl = 'https://www.baidu.com/s?wd=site%3A' + url + '&pn=1'

    try:
        cookies_to_dict(cookies_str)
        driver.implicitly_wait(10)
        driver.get(joinUrl)  # load the result page
    except Exception:
        # Always return a 2-tuple: the caller unpacks two values, which a
        # bare string result cannot satisfy.
        return 0, ''

    try:
        # "//*[...]": an XPath step needs a node test; "//[@id=...]" is not a
        # valid expression and made this lookup fail every time.
        index_number = driver.find_element(
            By.XPATH, '//*[@id="1"]/div/div[1]/div/p[3]/span/b').text
    except Exception:
        index_number = 0

    # Looked up independently of the count, so the prompt is still detected
    # on pages where the count element is missing.
    try:
        yanzen = driver.find_element(
            By.XPATH, '//*[@id="pass-slide-tipInfo62"]').text
    except Exception:
        yanzen = None

    return index_number, yanzen
def getUrl(filepath):
    """Read the list of URLs to check.

    :param filepath: path of a text file containing one URL per line

    :return: list of the file's lines, line endings included
    """
    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise become part of the first URL.
    with open(filepath, "r", encoding="utf-8-sig") as url_file:
        return url_file.readlines()
def get_hostloc(url):
    """Load *url* in the shared browser and return the page's visible text.

    :param url: address of the page to open

    :return: a tuple ``(page_source, yanzen)``. ``page_source`` is the text
        of the ``<body>`` element, or ``''`` when the page could not be
        loaded. ``yanzen`` is the element found at a fixed position in the
        page (the caller treats its presence as a captcha prompt), or
        ``None`` when no such element exists.
    """
    cookies_to_dict(cookies_str)

    try:
        driver.get(url)  # load the page
        driver.implicitly_wait(10)
    except Exception:
        # Always return a 2-tuple: the caller unpacks two values, which a
        # bare string result cannot satisfy.
        return '', None

    try:
        yanzen = driver.find_element(By.XPATH, '/html/body/div[4]/div[1]/div/p[1]')
    except Exception:
        yanzen = None

    page_source = driver.find_element(By.XPATH, "//body").text
    return page_source, yanzen

def isindex(link):
    """Check whether Baidu has indexed *link* and record the outcome.

    The status is printed for every link. For links that are not reported as
    missing, the indexed-page count (or a captcha notice) is appended to
    ``result.txt`` as ``<link>\\t<value>``.

    NOTE(review): the published listing lost its indentation; the count
    lookup is assumed to run for every link not reported as missing,
    including when the first page showed a captcha -- confirm against the
    repository version.

    :param link: URL to check, with or without ``http://`` / ``https://``

    :return: None
    """
    # Drop the scheme and escape "/" so the link fits into the query string.
    link = link.replace("http://", "")
    link = link.replace("https://", "").replace("/", "%2F")

    html, yanzen = get_hostloc("http://www.baidu.com/s?wd=" + link)
    not_found = "没有找到与" in html or "没有找到该URL" in html or html == ''

    # Explicit UTF-8: the file receives Chinese text, which a non-Chinese
    # locale encoding cannot encode.
    with open("result.txt", 'a', encoding="utf-8") as f:
        if yanzen:
            print(link, "出现验证码")
        elif not_found:
            print(link, "未收录")
        else:
            print(link, "收录")

        if not_found:
            return

        indexed_number, yanzen = check_index_number(link)
        if yanzen:
            print(link, "出现验证码")
            f.write(link + '\t出现验证码\n')
        else:
            f.write(link + '\t' + str(indexed_number) + '\n')
def main():
    """Check every URL listed in ``url.txt``, one per line."""
    filepath = "url.txt"  # text file with the URLs to query, one per line

    for url in getUrl(filepath):
        url = url.strip()
        if not url:
            # Blank lines would otherwise be sent as an empty search query.
            continue
        isindex(url)

# Script entry point. The double underscores are required: plain `name` is
# an undefined variable and 'main' never equals the module name.
if __name__ == '__main__':
    try:
        main()
    finally:
        # Close the browser and stop the chromedriver process even when the
        # run is interrupted by an error.
        driver.quit()

南岭笑笑生

python百度收录批量查询

发布时间

标签

阅读时间

相关阅读