python+selenium模拟百度关键词排名点击

声明:代码为转载内容,仅限于参考学习python。

  # -*- coding: utf-8 -*-
  import os
  import random
  import socket
  import time
  import traceback
  import urllib.request

  import pymysql
  import requests
  from selenium import webdriver
  from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  11. #import win32api #pip install pypiwin32
  12. #from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  13. #DesiredCapabilities.INTERNETEXPLORER[‘ignoreProtectedModeSettings’] = True
  14. #rasdial 宽带连接 19ab68—-643534
  def connect():
      """Dial the connection described by the module-level ``g_adsl_account``.

      Runs ``rasdial <name> <username> <password>`` through the shell and then
      pauses for five seconds.  The exit status of ``rasdial`` is not checked.
      """
      cmd_str = "rasdial %s %s %s" % (
          g_adsl_account['name'],
          g_adsl_account['username'],
          g_adsl_account['password'],
      )
      os.system(cmd_str)
      # Fixed pause after dialling; presumably lets the link come up.
      time.sleep(5)
  19. #”rasdial 断开宽带连接 /disconnect”
  def disconnect():
      """Hang up the connection named in the module-level ``g_adsl_account``.

      Runs ``rasdial <name> /disconnect`` through the shell and then pauses
      for five seconds.  The exit status of ``rasdial`` is not checked.
      """
      cmd_str = "rasdial %s /disconnect" % g_adsl_account['name']
      os.system(cmd_str)
      # Fixed pause after hanging up, mirroring connect().
      time.sleep(5)
  24. #获取ip地址
  def get_ip(timeout=10):
      """Fetch the current public IP and its location text.

      Downloads ``http://ip.chinaz.com/getip.aspx`` and cuts the two values
      out of the body using fixed offsets around the words ``ip`` and
      ``address``.

      Args:
          timeout: Seconds to wait for the HTTP request before giving up.

      Returns:
          A two-item list ``[ip, address]``.  If either marker word is absent
          from the response, ``["", ""]`` is returned instead of slicing at
          meaningless offsets.

      Raises:
          urllib.error.URLError: If the request fails or times out.
      """
      # The context manager closes the response even if read/decode raises.
      with urllib.request.urlopen("http://ip.chinaz.com/getip.aspx",
                                  timeout=timeout) as fp:
          # urlopen returns bytes in Python 3; decode to text before searching.
          mystr = fp.read().decode("utf8")

      ip_pos = mystr.find("ip")
      addr_pos = mystr.find("address")
      if ip_pos == -1 or addr_pos == -1:
          return ["", ""]

      # NOTE(review): the offsets assume a body shaped like
      # {ip:'1.2.3.4',address:'...'} -- confirm against the live service.
      ip = mystr[ip_pos + 4:addr_pos - 2]
      address = mystr[addr_pos + 9:-2]
      return [ip, address]
  38. #将ip地址插入数据库
  def insert_db(ipdate):
      """Insert one run record into the ``ip_log`` table of database ``zongzong``.

      Args:
          ipdate: Sequence of seven values in column order
              ``ip, address, keyword, url, error, page, rank``.  The current
              local time is appended as ``created_at``.  The caller's sequence
              is copied, not modified.

      The connection and cursor are always closed, including when a statement
      raises; in that case nothing is committed and the error propagates.
      """
      ISOTIMEFORMAT = '%Y-%m-%d %X'
      row = list(ipdate)
      row.append(time.strftime(ISOTIMEFORMAT, time.localtime()))

      # No database is named in connect(); it is selected with USE below.
      conn = pymysql.connect(host='localhost', user='root', passwd='',
                             port=3306, charset='utf8')
      try:
          cur = conn.cursor()
          try:
              cur.execute("USE zongzong")
              # NOTE: the commented-out CREATE TABLE that accompanied this
              # function did not define `page` or `rank`; the table must have
              # both columns for this INSERT to succeed.
              # Column names are back-quoted because `rank` is a reserved
              # word in MySQL 8.
              cur.execute(
                  "INSERT INTO ip_log(`ip`,`address`,`keyword`,`url`,`error`,"
                  "`page`,`rank`,`created_at`) "
                  "VALUES(%s, %s, %s, %s, %s, %s, %s, %s)",
                  row,
              )
          finally:
              cur.close()
          conn.commit()
      finally:
          conn.close()
  61. #获取搜索出来的url
  def get_search_url(driver):
      """Collect the clickable result links of the current result page.

      Looks inside ``div#content_left`` for anchors whose ``class`` attribute
      is exactly ``c-showurl``.  Each anchor's ``href`` is probed with an HTTP
      HEAD request and the ``Location`` response header is taken as the
      landing URL.  When the response has no ``Location`` header, the
      ``href`` itself is used.  Results whose landing URL contains any entry
      of the module-level ``out_urls`` are skipped.

      Args:
          driver: Selenium WebDriver positioned on a result page (uses the
              ``find_element_by_*`` API of Selenium 3 and earlier).

      Returns:
          ``[real_url, click_link]``: two parallel lists holding the landing
          URL strings and the matching anchor elements.

      Raises:
          requests.RequestException: If a HEAD request fails or times out.
      """
      real_url = []
      click_link = []
      content = driver.find_element_by_css_selector("div[id=\"content_left\"]")
      for link in content.find_elements_by_tag_name("a"):
          if link.get_attribute('class') != "c-showurl":
              continue
          url = link.get_attribute('href')
          # HEAD does not follow redirects by default, so the redirect target
          # can be read straight from the Location header.
          header = requests.head(url, timeout=10).headers
          location = header.get('location', url)
          if any(out_url in location for out_url in out_urls):
              continue
          real_url.append(location)
          # Keep the anchor element so the caller can click it later.
          click_link.append(link)
      return [real_url, click_link]
  89. #function:解析加密url,剔除竞争对手的url
  90. # def get_real_url(urls):
  91.     # real_url = []
  92.     # for url in urls:
  93.         # header = requests.head(url).headers
  94.         # is_append = True
  95.         # for out_url in out_urls:
  96.             # if out_url in header[‘location’]:
  97.                 # is_append = False
  98.                 # break
  99.         # if is_append == True:
  100.             # real_url.append(header[‘location’])
  101.     # return real_url
  102. #function 目标地址是否在某个list中
  def get_urlIndex(tagurl, urls):
      """Return the position of the first URL that contains ``tagurl``.

      Args:
          tagurl: Substring to look for.
          urls: Iterable of URL strings.

      Returns:
          Zero-based index of the first element of ``urls`` containing
          ``tagurl``, or ``-1`` when none does.
      """
      for i, url in enumerate(urls):
          if tagurl in url:
              return i
      return -1
  112. #点击百度搜索内容下面的下一页
  def click_nextBtn(driver):
      """Click the "next page" link in the pager and return the driver.

      Walks the anchors inside ``div#page``, printing each anchor's text, and
      clicks the first one whose text is exactly ``下一页>``.  Iteration
      stops right after the click because the remaining elements belong to a
      page that is being replaced.  If no such anchor exists, nothing is
      clicked.

      Args:
          driver: Selenium WebDriver positioned on a result page.

      Returns:
          The same ``driver`` object.
      """
      div = driver.find_element_by_css_selector("div[id=\"page\"]")
      for item in div.find_elements_by_tag_name("a"):
          print(item.text)
          if item.text == "下一页>":
              item.click()
              break
      return driver
  121. #随机点击
  def click_search_url(driver, items):
      """Click selected results on the current page, then close what they open.

      Anchors inside ``div#content_left`` whose ``class`` is exactly
      ``c-showurl`` are numbered from 0 in document order.  For every number
      contained in ``items`` the anchor is clicked, the function sleeps a
      random 5-10 seconds, closes every window other than the one that was
      current on entry, and switches back to that window.

      Args:
          driver: Selenium WebDriver positioned on a result page.
          items: Container of zero-based result numbers to click.  Numbers
              beyond the last result are ignored.

      Returns:
          None.
      """
      content = driver.find_element_by_css_selector("div[id=\"content_left\"]")
      links = content.find_elements_by_tag_name("a")
      # Remember the result-page window so we can return to it.
      nowhandle = driver.current_window_handle

      showurl_links = (
          link for link in links if link.get_attribute('class') == "c-showurl"
      )
      for i, link in enumerate(showurl_links):
          if i not in items:
              continue
          print("随机点击item:", i)
          print(link.get_attribute('href'), link.text)
          link.click()
          # Linger on the opened page for a random time.
          time.sleep(random.randint(5, 10))
          # Close every window except the original result-page window.
          for handle in driver.window_handles:
              if handle != nowhandle:
                  print("切换到当前窗口")
                  driver.switch_to_window(handle)
                  print("title:", driver.title)
                  driver.close()
                  print("切换到原来的窗口")
                  driver.switch_to_window(nowhandle)
                  print("title:", driver.title)
          print("本次随机点击完毕!")
  162. #获取随机点击的搜索结果序号
  def get_random_index(index, len):
      """Pick which result numbers to click at random.

      Args:
          index: Zero-based position of the target among the results, or
              ``-1`` when the target is not on the page.
          len: Number of results on the page.  (The name shadows the builtin
              inside this function; it is kept for caller compatibility.)

      Returns:
          A list of zero-based result numbers:

          * ``index >= 8``: one number from 0-4 and one from 5-8.
          * ``4 <= index < 8``: one number from 0-3 and one from 3-``index``.
          * ``0 <= index < 4``: ``[index]``.
          * ``index == -1``: one number inside the page -- from
            0..``len``-1 when ``len <= 5``, otherwise from 5..``len``-1.
            An empty list when ``len`` is 0.
          * any other ``index``: an empty list.
      """
      if index >= 8:
          return [random.randint(0, 4), random.randint(5, 8)]
      if index >= 4:
          return [random.randint(0, 3), random.randint(3, index)]
      if index >= 0:
          return [index]
      if index == -1:
          if len <= 0:
              return []
          # randint is inclusive on both ends, so the upper bound must be
          # len - 1 to stay on an existing result.
          if len <= 5:
              return [random.randint(0, len - 1)]
          return [random.randint(5, len - 1)]
      return []
  def getUA():
      """Return one User-Agent string chosen at random from a fixed list."""
      uaList = [
          # 360
          "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
          # chrome
          "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
          "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
          # firefox
          "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0",
          # opera
          "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
      ]
      return random.choice(uaList)
  211. #屏幕浏览器窗口大小
  def getWindowSize():
      """Return one ``[width, height]`` pair chosen at random from a fixed list."""
      wind_size = [
          [1920, 1080],
          [1600, 900],
          [1280, 720],
      ]
      return random.choice(wind_size)
  220. #屏幕分辨率设置
  def setDisplay():
      """Switch the Windows display to a resolution chosen at random.

      Picks one ``[width, height]`` pair from a fixed list, writes it into the
      DEVMODE structure returned by ``win32api.EnumDisplaySettings(None, 0)``
      together with a 32-bit colour depth, and applies the structure with
      ``win32api.ChangeDisplaySettings``.

      Raises:
          ImportError: If ``win32api`` (pip package ``pypiwin32``) is missing.
      """
      # Imported here because the module-level import of win32api is
      # commented out; without this the function fails with NameError.
      import win32api

      display_size = [
          [1920, 1080],
          [1680, 1050],
          [1600, 900],
          [1440, 900],
          [1400, 1050],
      ]
      d_size = random.choice(display_size)
      dm = win32api.EnumDisplaySettings(None, 0)
      dm.PelsWidth = d_size[0]
      dm.PelsHeight = d_size[1]
      dm.BitsPerPel = 32
      dm.DisplayFixedOutput = 0
      win32api.ChangeDisplaySettings(dm, 0)
  236. #拨号 19ab68—-643534
  # Dial-up account used by connect() / disconnect().
  # NOTE(review): credentials are hard-coded in source; consider loading them
  # from the environment or a config file that is not published.
  g_adsl_account = {
      "name": "宽带连接",
      "username": "19ab68",
      "password": "643534",
  }
  242. #屏蔽点击的地址(竞争对手)
  # Substrings of landing URLs that must never be clicked; get_search_url()
  # drops any result whose resolved URL contains one of them.
  out_urls = [
      'zhimo.yuanzhumuban.cc',
      'bbs.yuanzhumuban.cc',
      'http://money.163.com/15/0416/11/ANANRECC00253B0H.html',
  ]
  248. ##内页词
  # One entry per search job: [target URL substring, search keyword].
  # Any items after the second are treated by the main loop as alternative
  # "wrong" keywords, one of which is searched first.
  targetURL = [
      ['http://www.hkuws.com', '注册离岸公司'],
      ['zs.efu.com.cn/mornfeeit/', '梦菲雪'],
      ['zs.efu.com.cn/chengshijiaren/', '城市佳人'],
      ['www.kidsnet.cn/exposition', '童装展会'],
      #['top.kidsnet.cn/', '童装加盟排行榜'],
      #['www.nynet.com.cn/', '内衣网'],
      #['www.nzw.cn/', '女装网'],
      ['zs.efu.com.cn/ks/', '卡索'],
      ['zs.efu.com.cn/distin-kidny/', '迪斯廷凯'],
      # The published listing had "代{过}{滤}理" here, a forum word-filter
      # artifact; the intended keyword is restored.
      ['zs.efu.com.cn/fuzhuang/luyidigao/', '路易迪高童装代理'],
      ['brand.efu.com.cn/brandshow-1221090.html', '凯帝龙驰'],
      ['zs.efu.com.cn/rabbitjero/', '兔子杰罗'],
      ['zs.efu.com.cn/wmprince/', '西瓜王子'],
      ['zs.efu.com.cn/betu', '百图'],
      ['zs.efu.com.cn/pepco/', '小猪班纳'],
      #['http://news.ifeng.com/a/20160518/48795120_0.shtml', '华夏信财'],
      ['http://weibo.com/huaxiafinance', '华夏信财'],
      ['http://p2p.hexun.com/2016-04-26/183531215.html', '华夏信财'],
      #['http://news.xinhuanet.com/fortune/2016-04/26/c_128932834.htm', '华夏信财'],
      ['http://www.xcf.cn/gdyw/201605/t20160526_772682.htm', '华夏信财'],
      ['http://www.huaxiaoxia.com/', '华夏信财'],
      #['https://lc.huaxiafinance.com/', '华夏信财'],
      ['so.tedu.cn', '网络营销培训机构'],
      ['www.cosatto.net.cn', '个性安全座椅'],
      ['www.kaihuata.com/', '开化旅游'],
      #['www.kaihuata.com/', '开化'],
  ]
  # Main loop: for every [target, keyword, ...] entry, redial, search the
  # keyword, page through up to five result pages looking for the target,
  # visit it if found, and record the outcome with insert_db().
  for targetInfo in targetURL:
      # Bind these before anything can fail so the failure handler below
      # always logs the entry that was actually being processed.
      target = targetInfo[0]
      keyword = targetInfo[1]
      driver = None
      try:
          # Redial to change IP, then keep redialling until DNS resolves.
          disconnect()
          connect()
          while True:
              try:
                  socket.gethostbyname("baidu.com")
                  break
              except OSError:
                  disconnect()
                  connect()

          # Start PhantomJS with a random User-Agent and image loading off.
          dcap = dict(DesiredCapabilities.PHANTOMJS)
          user_agent = getUA()
          print(user_agent)
          dcap["phantomjs.page.settings.userAgent"] = user_agent
          dcap["phantomjs.page.settings.loadImages"] = False
          # "--load-images=no": the published listing had the two hyphens
          # merged into a dash, which PhantomJS would not recognise.
          driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                       service_args=['--load-images=no'])
          driver.implicitly_wait(30)
          driver.delete_all_cookies()
          driver.get("http://www.baidu.com/")
          window_size = getWindowSize()
          driver.set_window_size(window_size[0], window_size[1])
          print('打开百度成功', driver.title)

          # Optional decoy: entries with more than two items carry "wrong"
          # keywords, one of which is searched before the real keyword.
          error_keyword = None
          if len(targetInfo) > 2:
              error_keyword = targetInfo[random.randint(2, len(targetInfo) - 1)]
          print(">>>>>>>>>>>>>>>点击的关键词:", keyword, "—>目标地址:", target, ">>>>>>>>>>>>>>>>>>>>")
          if error_keyword is not None:
              print("点击错误关键词:", error_keyword)
              driver.find_element_by_id("kw").send_keys(error_keyword)
              time.sleep(2)
              driver.find_element_by_id("su").click()
              time.sleep(5)
              driver.find_element_by_id("kw").clear()
              time.sleep(2)
              print("错误关键词点击完毕")

          # Search the real keyword.
          driver.find_element_by_id("kw").send_keys(keyword)
          print("…开始点击搜索按钮..")
          driver.find_element_by_id("su").click()
          print("…点击完毕..")
          time.sleep(2)

          # urls_res[0]: landing URLs, urls_res[1]: matching anchor elements.
          urls_res = get_search_url(driver)
          real_urls = urls_res[0]
          print("搜索出来的可点击着陆页个数:", len(real_urls))
          print(real_urls)
          index = get_urlIndex(target, real_urls)
          print("目标index:", index)

          # Page forward (at most four times) until the target shows up.
          page = 1
          while index == -1 and page <= 4:
              if page == 1:
                  # Click a random result on the first page before moving on.
                  items = get_random_index(index, len(real_urls))
                  print(items)
                  click_search_url(driver, items)
              driver = click_nextBtn(driver)
              time.sleep(3)
              urls_res = get_search_url(driver)
              real_urls = urls_res[0]
              print(real_urls)
              index = get_urlIndex(target, real_urls)
              page = page + 1

          if index > 4 and page == 1:
              # Target is on page one but low: click one or two other results.
              if random.randint(1, 2) == 2:
                  items = get_random_index(index, len(real_urls))
              else:
                  items = [1]
              print(items)
              click_search_url(driver, items)

          # Decide on the search result itself rather than on the page number,
          # so a target found on the fifth page is not logged as missing.
          if index == -1:
              print("没有找到目标地址,放弃搜索…")
              print("关闭浏览器")
              driver.quit()
              driver = None
              time.sleep(5)
              data = get_ip()
              data.append(keyword)
              data.append(target)
              data.append("no_find")
              data.append(-1)
              data.append(-1)
              insert_db(data)
              continue

          print("目标在page", page, "当前排名:", index, real_urls[index])
          print("反问最后的目标页…")
          urls_res[1][index].click()
          time.sleep(5)
          nowhandle = driver.current_window_handle
          allhandles = driver.window_handles
          # Stay on the newly opened target window for a while, then close it
          # and return to the result page.
          for handle in allhandles:
              if handle != nowhandle:
                  print("切换到当前窗口")
                  driver.switch_to_window(handle)
                  stime = random.randint(15, 25)
                  print("目标页title:", driver.title, "停留–>", stime)
                  time.sleep(stime)
                  driver.close()
                  print("切换到原来的窗口")
                  driver.switch_to_window(nowhandle)
                  print("title:", driver.title)

          # Show the cookies, clear them, and show them again.
          print("打印cookie")
          cookie = driver.get_cookies()
          print(cookie)
          print("清除cookie")
          driver.delete_all_cookies()
          print("打印cookie:")
          cookie = driver.get_cookies()
          print(cookie)

          print("关闭浏览器")
          time.sleep(5)
          driver.quit()
          driver = None

          # Record the successful run.
          data = get_ip()
          data.append(keyword)
          data.append(target)
          data.append("success")
          data.append(page)
          data.append(index)
          insert_db(data)
      except Exception:
          # Report the failure, then try to record it; a problem while
          # recording must not stop the remaining targets.
          traceback.print_exc()
          try:
              data = get_ip()
              data.append(keyword)
              data.append(target)
              data.append("faild")
              data.append(-1)
              data.append(-1)
              insert_db(data)
          except Exception:
              traceback.print_exc()
      finally:
          # Make sure the browser process is gone on every exit path.
          if driver is not None:
              try:
                  driver.quit()
              except Exception:
                  traceback.print_exc()
    © 版权声明
    THE END
    喜欢就支持一下吧
    点赞0 分享
    评论 抢沙发

    请登录后发表评论

      暂无评论内容