最近都在研究怎么做快排,分享下成果,可以一起学习交流!
附上代码:
# -*- coding: utf-8 -*-
import os
import random
import re
import socket
import time
import traceback
import urllib.request

import pymysql
import requests
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
- #import win32api #pip install pypiwin32
-
- #from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
- #DesiredCapabilities.INTERNETEXPLORER['ignoreProtectedModeSettings'] = True
-
-
-
def connect():
    """Dial the connection described by the module-level ``g_adsl_account``.

    Runs ``rasdial <name> <username> <password>`` through the system shell
    and then pauses for five seconds before returning.
    """
    account = g_adsl_account
    dial_cmd = "rasdial {} {} {}".format(
        account['name'], account['username'], account['password'])
    os.system(dial_cmd)
    time.sleep(5)
-
-
- #"rasdial 断开宽带连接 /disconnect"
def disconnect():
    """Hang up the connection named in the module-level ``g_adsl_account``.

    Runs ``rasdial <name> /disconnect`` through the system shell and then
    pauses for five seconds before returning.
    """
    hangup_cmd = "rasdial {} /disconnect".format(g_adsl_account['name'])
    os.system(hangup_cmd)
    time.sleep(5)
-
def get_ip(url="http://ip.chinaz.com/getip.aspx", timeout=10):
    """Return the current public IP and its location as ``[ip, address]``.

    The lookup service is expected to answer with a small object literal
    such as ``{ip:'1.2.3.4',address:'...'}``; keys and values may be quoted
    with either single or double quotes.

    Args:
        url: Endpoint to query. Defaults to the service this script has
            always used.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        A new two-element list ``[ip, address]``. A field that cannot be
        found in the response is returned as an empty string instead of an
        arbitrary slice of the response text.

    Raises:
        urllib.error.URLError: If the service cannot be reached.
    """
    # The context manager closes the response even if reading fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        # A single undecodable byte should not discard the whole answer.
        text = response.read().decode("utf8", errors="replace")

    def field(name):
        pattern = r"""["']?\b%s\b["']?\s*:\s*["']([^"']*)["']""" % name
        match = re.search(pattern, text)
        return match.group(1) if match else ""

    return [field("ip"), field("address")]
-
def insert_db(ipdate):
    """Insert one run record into table ``ip_log`` of database ``zongzong``.

    The database and the table must already exist; this function does not
    create them.

    Args:
        ipdate: Sequence of seven values in column order
            ``ip, address, keyword, url, error, page, rank``. The sequence is
            left untouched; the ``created_at`` timestamp is added to a copy.
    """
    # Earlier table definition kept for reference. NOTE(review): it predates
    # the `page` and `rank` columns used by the INSERT below.
    # CREATE TABLE `ip_log` (`id` int(11) NOT NULL AUTO_INCREMENT,`ip` varchar(32) DEFAULT NULL,`address` varchar(64) DEFAULT NULL,`keyword` varchar(64) DEFAULT '',`url` varchar(256) DEFAULT '',`error` varchar(64) DEFAULT '',`created_at` timestamp NULL DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,PRIMARY KEY (`id`)) ENGINE=InnoDB AUTO_INCREMENT=21 DEFAULT CHARSET=utf8;

    # Work on a copy so the caller's list is not silently extended, and use an
    # explicit time format instead of the locale-dependent %X.
    row = list(ipdate)
    row.append(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))

    conn = pymysql.connect(host='localhost', user='root', passwd='', port=3306, charset='utf8')
    try:
        cur = conn.cursor()
        try:
            cur.execute("USE zongzong")
            # `rank` is a reserved word from MySQL 8.0.2 on, so the column
            # names are back-quoted.
            cur.execute(
                "INSERT INTO ip_log(`ip`,`address`,`keyword`,`url`,`error`,`page`,`rank`,`created_at`) "
                "VALUES(%s, %s, %s, %s, %s, %s, %s, %s)",
                row,
            )
        finally:
            cur.close()
        conn.commit()
    finally:
        # Release the connection even when a statement fails.
        conn.close()
-
-
def get_search_url(driver):
    """Collect the landing pages behind the results of the current page.

    Every ``<a class="c-showurl">`` inside ``div#content_left`` is resolved
    with a HEAD request: the ``Location`` header of the answer is taken as
    the real landing URL. Results whose landing URL contains an entry of the
    module-level ``out_urls`` list are left out.

    Args:
        driver: Selenium WebDriver currently showing a search result page.

    Returns:
        ``[real_url, click_link]``: two parallel lists holding the resolved
        landing URLs and the matching ``<a>`` WebElements.
    """
    real_url = []
    click_link = []
    content = driver.find_element_by_css_selector("div[id=\"content_left\"]")
    for link in content.find_elements_by_tag_name("a"):
        if link.get_attribute('class') != "c-showurl":
            continue

        url = link.get_attribute('href')
        try:
            # Bounded wait: one unresponsive host must not stall the run.
            headers = requests.head(url, timeout=10).headers
        except requests.RequestException as exc:
            # A single unreachable result should not abort the whole page.
            print("skip result, HEAD request failed:", url, exc)
            continue

        # Without a redirect there is no Location header; the href itself is
        # then treated as the landing page instead of raising KeyError.
        location = headers.get('location', url)
        if any(out_url in location for out_url in out_urls):
            continue

        real_url.append(location)
        click_link.append(link)

    return [real_url, click_link]
-
-
- #function:解析加密url,剔除竞争对手的url
- # def get_real_url(urls):
- # real_url = []
- # for url in urls:
- # header = requests.head(url).headers
- # is_append = True
- # for out_url in out_urls:
- # if out_url in header['location']:
- # is_append = False
- # break
-
- # if is_append == True:
- # real_url.append(header['location'])
- # return real_url
-
def get_urlIndex(tagurl, urls):
    """Return the position of the first entry of *urls* containing *tagurl*.

    The match is a plain substring test. ``-1`` is returned when no entry
    contains *tagurl*.
    """
    matches = (pos for pos, candidate in enumerate(urls) if tagurl in candidate)
    return next(matches, -1)
-
-
def click_nextBtn(driver):
    """Click the "next page" link in the pager below the search results.

    Looks through the ``<a>`` elements of ``div#page``, printing each link
    text, and clicks the first one whose text is ``下一页>``. Nothing is
    clicked when no such link exists.

    Args:
        driver: Selenium WebDriver currently showing a search result page.

    Returns:
        The same *driver* object.
    """
    pager = driver.find_element_by_css_selector("div[id=\"page\"]")
    for item in pager.find_elements_by_tag_name("a"):
        print(item.text)
        if item.text == "下一页>":
            item.click()
            # Stop here: the remaining elements belong to the page that was
            # just left and may no longer be valid after the click.
            break

    return driver
-
-
-
-
def click_search_url(driver, items):
    """Click selected results of the current page and close what they open.

    The ``<a class="c-showurl">`` links inside ``div#content_left`` are
    numbered from 0 in document order. Each link whose number is in *items*
    is clicked; after a random 5-10 second pause every window other than the
    original one is closed and focus returns to the original window.

    NOTE(review): the numbering covers all ``c-showurl`` links, while the
    list returned by ``get_search_url`` leaves out ``out_urls`` matches, so
    the two index spaces can differ.

    Args:
        driver: Selenium WebDriver currently showing a search result page.
        items: Container of result positions to click.
    """
    content = driver.find_element_by_css_selector("div[id=\"content_left\"]")
    links = content.find_elements_by_tag_name("a")
    # Remember the result window so focus can return to it afterwards.
    nowhandle = driver.current_window_handle

    position = 0
    for link in links:
        if link.get_attribute('class') != "c-showurl":
            continue

        if position in items:
            print("随机点击item:", position)
            print(link.get_attribute('href'), link.text)
            link.click()
            # Stay on the opened page for a while.
            time.sleep(random.randint(5, 10))

            for handle in driver.window_handles:
                if handle != nowhandle:
                    print("切换到当前窗口")
                    # switch_to.window replaces the deprecated switch_to_window.
                    driver.switch_to.window(handle)
                    print("title:", driver.title)
                    driver.close()
                    print("切换到原来的窗口")
                    driver.switch_to.window(nowhandle)
                    print("title:", driver.title)
            print("本次随机点击完毕!")

        position += 1
-
-
def get_random_index(index, len):
    """Choose result positions to click, given where the target was found.

    Args:
        index: Position of the target in the current result list, or ``-1``
            when the target is not on this page.
        len: Number of results on the current page (the parameter name
            shadows the builtin and is kept only for compatibility).

    Returns:
        A list of 0-based positions:

        * ``index >= 8``: one position in 0..4 and one in 5..8.
        * ``4 <= index < 8``: one position in 0..3 and one in 3..index.
        * ``0 <= index < 4``: just ``[index]``.
        * ``index == -1``: one valid position, taken from 0..len-1 when
          there are at most five results and from 5..len-1 otherwise.
        * anything else, or an empty page: ``[]``.
    """
    if index >= 8:
        return [random.randint(0, 4), random.randint(5, 8)]
    if index >= 4:
        return [random.randint(0, 3), random.randint(3, index)]
    if index >= 0:
        return [index]
    if index != -1 or len <= 0:
        # Previously an index below -1 raised UnboundLocalError and an empty
        # page still produced a position.
        return []
    # randint is inclusive at both ends, so the last valid position is
    # len - 1; the old upper bounds (5 and len) could point past the list.
    if len <= 5:
        return [random.randint(0, len - 1)]
    return [random.randint(5, len - 1)]
-
-
-
def getUA():
    """Return one User-Agent string picked at random from a fixed pool."""
    labelled_agents = (
        ("360", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"),
        ("chrome", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"),
        ("chrome", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"),
        ("firefox", "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0"),
        ("opera", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60"),
    )
    # The label only documents which browser the string comes from.
    _label, agent = random.choice(labelled_agents)
    return agent
-
def getWindowSize():
    """Return a randomly chosen browser window size as ``[width, height]``."""
    width, height = random.choice((
        (1920, 1080),
        (1600, 900),
        (1280, 720),
    ))
    return [width, height]
-
-
def setDisplay():
    """Switch the display to a randomly chosen resolution (Windows only).

    One of five fixed resolutions is picked and applied through
    ``win32api.ChangeDisplaySettings`` with 32 bits per pixel.

    Raises:
        ImportError: If the ``pywin32`` package is not installed.
    """
    # Imported here because the module-level import is commented out, which
    # made every call fail with NameError. Install with: pip install pypiwin32
    import win32api

    display_size = [
        [1920, 1080],
        [1680, 1050],
        [1600, 900],
        [1440, 900],
        [1400, 1050],
    ]
    d_size = random.choice(display_size)

    # Start from an existing mode description and overwrite the size fields.
    dm = win32api.EnumDisplaySettings(None, 0)
    dm.PelsWidth = d_size[0]
    dm.PelsHeight = d_size[1]
    dm.BitsPerPel = 32
    dm.DisplayFixedOutput = 0
    win32api.ChangeDisplaySettings(dm, 0)
-
-
# Dial-up account handed to ``rasdial`` by connect() and disconnect().
# Each value can be overridden with an environment variable (ADSL_NAME,
# ADSL_USERNAME, ADSL_PASSWORD) so real credentials need not be kept in the
# source; the previous hard-coded values remain the defaults.
g_adsl_account = {
    "name": os.environ.get("ADSL_NAME", "宽带连接"),
    "username": os.environ.get("ADSL_USERNAME", "19ab68"),
    "password": os.environ.get("ADSL_PASSWORD", "643534"),
}
-
-
# Landing pages that must never be clicked: a search result is dropped by
# get_search_url() when its resolved URL contains any of these substrings.
out_urls = [
    "zhimo.yuanzhumuban.cc",
    "bbs.yuanzhumuban.cc",
    "http://money.163.com/15/0416/11/ANANRECC00253B0H.html",
]
-
-
# Work list processed by the main loop, one entry per search to perform.
# Entry layout: [target, keyword, *optional extra keywords]
#   target  - substring looked for in the resolved landing URLs
#   keyword - text typed into the search box
#   extras  - when present, one of them is searched first (picked at random)
targetURL = [

    ['http://www.hkuws.com', '注册离岸公司'],
    ['zs.efu.com.cn/mornfeeit/', '梦菲雪'],
    ['zs.efu.com.cn/chengshijiaren/', '城市佳人'],
    ['www.kidsnet.cn/exposition', '童装展会'],
    #['top.kidsnet.cn/','童装加盟排行榜'],
    #['www.nynet.com.cn/','内衣网'],
    #['www.nzw.cn/','女装网'],
    ['zs.efu.com.cn/ks/', '卡索'],
    ['zs.efu.com.cn/distin-kidny/', '迪斯廷凯'],
    # The pasted keyword contained a "{过}{滤}" marker inserted by the forum's
    # word filter; the intended word is "代理".
    ['zs.efu.com.cn/fuzhuang/luyidigao/', '路易迪高童装代理'],
    ['brand.efu.com.cn/brandshow-1221090.html', '凯帝龙驰'],
    ['zs.efu.com.cn/rabbitjero/', '兔子杰罗'],
    ['zs.efu.com.cn/wmprince/', '西瓜王子'],
    ['zs.efu.com.cn/betu', '百图'],
    ['zs.efu.com.cn/pepco/', '小猪班纳'],

    #['http://news.ifeng.com/a/20160518/48795120_0.shtml','华夏信财'],
    ['http://weibo.com/huaxiafinance', '华夏信财'],
    ['http://p2p.hexun.com/2016-04-26/183531215.html', '华夏信财'],
    #['http://news.xinhuanet.com/fortune/2016-04/26/c_128932834.htm','华夏信财'],
    ['http://www.xcf.cn/gdyw/201605/t20160526_772682.htm', '华夏信财'],
    ['http://www.huaxiaoxia.com/', '华夏信财'],
    #['https://lc.huaxiafinance.com/','华夏信财'],

    ['so.tedu.cn', '网络营销培训机构'],
    ['www.cosatto.net.cn', '个性安全座椅'],
    ['www.kaihuata.com/', '开化旅游'],
    #['www.kaihuata.com/','开化'],

]
-
-
def _wait_until_online():
    """Redial until ``baidu.com`` resolves, i.e. the new connection works."""
    while True:
        try:
            socket.gethostbyname("baidu.com")
            return
        except OSError:
            # Name resolution failed: hang up and dial again.
            disconnect()
            connect()


def _start_browser():
    """Start a PhantomJS session with a random User-Agent and images off."""
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    user_agent = getUA()
    print(user_agent)
    dcap["phantomjs.page.settings.userAgent"] = user_agent
    dcap["phantomjs.page.settings.loadImages"] = False
    driver = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--load-images=no'])
    driver.implicitly_wait(30)
    return driver


def _record_run(keyword, target, status, page, rank):
    """Store the outcome of one run together with the current public IP."""
    data = get_ip()
    data.extend([keyword, target, status, page, rank])
    insert_db(data)


def _search_and_visit(driver, targetInfo):
    """Search one keyword, locate the target in the results and open it.

    Args:
        driver: A freshly started WebDriver session.
        targetInfo: ``[target, keyword, *optional extra keywords]``.

    Returns:
        ``(status, page, index)``: ``("success", page, index)`` when the
        target was found and opened, ``("no_find", -1, -1)`` otherwise.
    """
    target = targetInfo[0]
    keyword = targetInfo[1]

    driver.delete_all_cookies()
    driver.get("http://www.baidu.com/")
    window_size = getWindowSize()
    driver.set_window_size(window_size[0], window_size[1])
    print('打开百度成功', driver.title)

    has_extra_keyword = len(targetInfo) > 2
    if has_extra_keyword:
        error_keyword = targetInfo[random.randint(2, len(targetInfo) - 1)]
    print(">>>>>>>>>>>>>>>点击的关键词:",keyword,"--->目标地址:",target,">>>>>>>>>>>>>>>>>>>>")

    if has_extra_keyword:
        # Search one of the extra keywords first, then clear the box.
        print("点击错误关键词:", error_keyword)
        driver.find_element_by_id("kw").send_keys(error_keyword)
        time.sleep(2)
        driver.find_element_by_id("su").click()
        time.sleep(5)
        driver.find_element_by_id("kw").clear()
        time.sleep(2)
        print("错误关键词点击完毕")

    driver.find_element_by_id("kw").send_keys(keyword)
    print("...开始点击搜索按钮..")
    driver.find_element_by_id("su").click()
    print("...点击完毕..")
    time.sleep(2)

    # urls_res[0]: resolved landing URLs, urls_res[1]: matching link elements
    urls_res = get_search_url(driver)
    real_urls = urls_res[0]
    print("搜索出来的可点击着陆页个数:", len(real_urls))
    print(real_urls)
    index = get_urlIndex(target, real_urls)
    print("目标index:", index)

    page = 1
    while index == -1 and page <= 4:
        if page == 1:
            # Open some other results of the first page before moving on.
            items = get_random_index(index, len(real_urls))
            print(items)
            click_search_url(driver, items)

        driver = click_nextBtn(driver)
        time.sleep(3)
        urls_res = get_search_url(driver)
        real_urls = urls_res[0]
        print(real_urls)
        index = get_urlIndex(target, real_urls)
        page = page + 1

    if index > 4 and page == 1:
        # Target is low on the first page: also open one or two other results.
        if random.randint(1, 2) == 2:
            items = get_random_index(index, len(real_urls))
        else:
            items = [1]
        print(items)
        click_search_url(driver, items)

    # Decide on the search result itself. The former test (page >= 5) also
    # discarded a target that was found on the last page examined.
    if index == -1:
        print("没有找到目标地址,放弃搜索...")
        return "no_find", -1, -1

    print("目标在page", page, "当前排名:", index, real_urls[index])
    print("反问最后的目标页...")
    urls_res[1][index].click()
    time.sleep(5)

    nowhandle = driver.current_window_handle
    for handle in driver.window_handles:
        if handle != nowhandle:
            print("切换到当前窗口")
            # switch_to.window replaces the deprecated switch_to_window.
            driver.switch_to.window(handle)
            stime = random.randint(15, 25)
            print("目标页title:", driver.title, "停留-->", stime)
            time.sleep(stime)
            driver.close()

            print("切换到原来的窗口")
            driver.switch_to.window(nowhandle)
            print("title:", driver.title)

    print("打印cookie")
    print(driver.get_cookies())
    print("清除cookie")
    driver.delete_all_cookies()
    print("打印cookie:")
    print(driver.get_cookies())

    return "success", page, index


# Main loop: one redial, one browser session and one log row per entry.
for targetInfo in targetURL:
    # Read these first so the failure handler always logs the entry that
    # actually failed (they used to be undefined or left over from the
    # previous entry when an early step raised).
    target = targetInfo[0]
    keyword = targetInfo[1]
    try:
        # Reconnect so this run starts on a fresh connection.
        disconnect()
        connect()
        _wait_until_online()
        #setDisplay()

        driver = _start_browser()
        try:
            status, page, index = _search_and_visit(driver, targetInfo)
        finally:
            # Always end the browser process, also when the run failed.
            print("关闭浏览器")
            driver.quit()
        time.sleep(5)

        _record_run(keyword, target, status, page, index)

    except Exception:
        # Show why the run failed instead of hiding it; KeyboardInterrupt is
        # no longer caught, so the script can be stopped with Ctrl+C.
        traceback.print_exc()
        try:
            _record_run(keyword, target, "faild", -1, -1)
        except Exception:
            # A logging problem must not end the whole loop.
            traceback.print_exc()
-
转载自吾爱破解