A Baijiahao Scraping Project

Chit-chat

Over the holiday I took on a project to scrape Baidu Baijiahao, and for various reasons I didn't finish it. It's really embarrassing that my skills weren't up to it, and my very first project died just like that. But if the skills aren't there yet, I can learn them (heh heh heh).

First, let me describe what the project was about.

Project approach

The first part was, for a link like http://baijiahao.baidu.com/u?app_id=1551032544962326, to get the title, link, read count, and comment count of every article under it. You can see that this page loads dynamically: you can keep scrolling down until nothing is left, which is the typical XHR-style loading (the mobile Taobao site works the same way). XHR involves a fair amount of front-end knowledge that I don't know much about; so far I only know how to send the XHR request from Python and pull out what we want. It's just a simulated request, a few lines of code, so there isn't much to say about it.
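
Just for reference, here is a minimal sketch of what I mean by simulating that request. The endpoint and the paging parameter below are made-up placeholders, not the real API; the real ones have to be copied from the XHR request you see in the browser's developer tools while scrolling the profile page.

import requests

# Hypothetical feed endpoint -- replace with the real XHR URL from devtools.
FEED_URL = "https://baijiahao.baidu.com/some/feed/endpoint"

def fetch_feed_page(app_id, page):
    '''Simulate one XHR request for one page of a profile's article list.'''
    params = {
        "app_id": app_id,   # the 16-digit profile id
        "page": page,       # hypothetical paging parameter
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64)",
        "X-Requested-With": "XMLHttpRequest",   # how browsers mark XHR requests
    }
    resp = requests.get(FEED_URL, params=params, headers=headers)
    # assuming the endpoint returns JSON with each article's title, link,
    # read count and comment count; keep paging until it comes back empty
    return resp.json()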

The second part was getting the Baijiahao ids, and that was the hard one: the id has 16 digits, so brute force is out of the question, and there was nowhere else to get them from. I was completely stuck until I looked at Baidu itself and realized they had already collected everything for us (thank you, Baidu's crawler). Of course, the problem then turned from scraping Baijiahao into scraping Baidu search. And there came the next difficulty: Baidu search isn't something I can handle that easily either, because the search results don't give you the real URL directly, which is awkward. I started with PhantomJS, but the efficiency was terrible; my server's CPU was maxed out and I still couldn't get more than a handful of results in a day. In the end I changed my approach: instead of trying to get all 760 results of a single search in one go, I fetch just one page per keyword and use lots of different keywords. That makes everything simple; all it takes is a GET request to Baidu.
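
About those result links that can't be used directly: what you get from the result page is a redirect URL of the form https://www.baidu.com/link?url=.... In bajihao.py below I resolve it by requesting that link and regex-matching the baijiahao URL out of the response; an even simpler sketch, assuming the redirect is an ordinary HTTP redirect (it isn't always, which is why I fell back to the regex), would be:

import requests

def resolve_baidu_link(link_url):
    '''Follow a www.baidu.com/link?url=... redirect and return the final URL.'''
    # requests follows plain HTTP redirects by default
    resp = requests.get(link_url, timeout=10)
    return resp.url

# e.g. resolve_baidu_link('https://www.baidu.com/link?url=...')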

Here's my code

bajihao.py

'''This module handles getting Baijiahao URLs and extracting the news we want from those URLs.'''
import re
import time

import bs4
import requests
from selenium import webdriver
class sobaidu():
    '''sobaidu: get the real URLs via Baidu search and write them out to the database.'''

    def __init__(self):
        self.KEYFILENAME = "keylist.txt"
        self.URLFILENAME = "urllist.txt"
        self.KEYLIST = set()
        self.URLLIST = set()
        self.URLFILE = open(self.URLFILENAME, 'w')

    def _readkey(self):
        '''Read all the keywords needed for the Baidu searches.'''
        with open(self.KEYFILENAME) as keyklistfile:
            for i in keyklistfile.readlines():
                self.KEYLIST.add(i.strip())  # strip the trailing newline
    def _changeurl(self, url):
        '''Turn a Baidu search-result URL into the real URL.'''
        try:
            req = requests.get(url+'&wd=')
            # time.sleep(1)
            # print(req.text)
            regx = r'http[s]*://baijiahao.baidu.com/[\S]*id=[0-9]*'
            pattern = re.compile(regx)
            match = re.findall(pattern, req.text)
            return match[0]
        except Exception as e:
            print(e)

    def _writetomysql(self):
        '''Write the real URLs into the database (not implemented yet).'''
        pass

    def _writetofile(self,url):
        self.URLFILE.write(url)
        self.URLFILE.write('\n')

    def sobaidu(self):
        '''Drive the functions above to do the whole job.'''
        # browser = webdriver.Chrome()
        browser = webdriver.PhantomJS()
        num = 0
        for key in self.KEYLIST:
            # work through this keyword's result pages
            num += 1
            now_num = 0
            browser.implicitly_wait(30)
            browser.get('https://www.baidu.com/s?wd=site:(baijiahao.baidu.com)' + key) 
            while True:
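                # on the first results page the "next page" button is the 10th
                # <a> under #page; once a "previous page" link appears it shifts
                # to the 11th, hence the a[10] here and the a[11] further down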
                if now_num == 1:
                    try:
                        browser.find_element_by_xpath('//*[@id="page"]/a[10]').click()
                        time.sleep(2)
                    except Exception as e:
                        print(e)
                        print("有问题")
                        break
                now_num += 1
                print(now_num)
                source = browser.page_source
                soup = bs4.BeautifulSoup(source, 'lxml')
                print('next_page')
                for i in soup.findAll(class_='result c-container '):
                    url = i.find(class_='t').find('a').get('href')
                    # print(url)
                    # self.URLLIST.add(self._changeurl(url))
                    self._writetofile(self._changeurl(url))
                time.sleep(1)
                if now_num > 1:
                    try:
                        browser.find_element_by_xpath('//*[@id="page"]/a[11]').click()
                        time.sleep(1)
                    except:
                        print('not find next_button may be for the page end!!!')
                        break

class getappid:
    def __init__(self):
        self.URLFILENAME = "urllist.txt"
        self.APPIDLIST = "appid.txt"
        self.URLLIST = set()
        self.APPIDFILE = open(self.APPIDLIST, 'w')

    def _readurl(self):
        '''Read the article-page URLs.'''
        with open(self.URLFILENAME) as urllistfile:
            for i in urllistfile.readlines():
                self.URLLIST.add(i.strip())  # strip the trailing newline
    def _writeappid(self, appid):
        self.APPIDFILE.write(appid)
        self.APPIDFILE.write('\n')
        print("写入成功")
    def getid(self):
        # browser = webdriver.PhantomJS()
        browser = webdriver.Chrome()
        browser.implicitly_wait(10)
        # browser.set_script_timeout(10)
        # browser.set_page_load_timeout(10)
        for url in self.URLLIST:
            browser.get(url)
            regx = r'http[s]*://baijiahao.baidu.com/u[\S]*id=[0-9]*'
            pattern = re.compile(regx)
            match = re.findall(pattern, browser.page_source)
            time.sleep(1)
            try:
                print(match[0])
                self._writeappid(match[0])
            except Exception as e:
                print('no match found')
def main():
    dsfsd = sobaidu()
    # strings = dsfsd._changeurl('https://www.baidu.com/link?url=w8wWEQMyVf0cD3TsKcn_pTQZ92cIqLqxVZKWFtT4rYJcESE_qfhKlPJg5B7OM2mXhZoSM1H0ogmCIgi4G2EkP_&wd=&eqid=aa2c3db90000bf4c0000000458831761')
    # print(strings)
    # # ###################### weird divider ###############
    dsfsd._readkey()
    print(len(dsfsd.KEYLIST))
    #########################  slightly silly divider #############
    dsfsd.sobaidu()
    # print(len(dsfsd.URLLIST))
    # for i in dsfsd.URLLIST:
    #     print(i)
    # ####################### my-computer-is-lagging divider ###########
    dsfsd.URLFILE.close()
def getid():
    dsfsd = getappid()
    dsfsd._readurl()
    print(len(dsfsd.URLLIST))
    dsfsd.getid()
    dsfsd.APPIDFILE.close()

if __name__ == '__main__':
    main()      # first pass: collect the article URLs
    # getid()   # second pass: extract the app ids from those URLs
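
To be clear about how this file is meant to be used: main() does the first pass and fills urllist.txt with the resolved article URLs, and getid() does the second pass, opening each of those URLs in a browser and writing the matched baijiahao.baidu.com profile links into appid.txt.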

get_url.py

import re
import time

import bs4
import requests
# from selenium import webdriver

class sobaidu():
    '''sobaidu: get the real URLs via Baidu search and write them out to the database.'''

    def __init__(self):
        self.KEYFILENAME = "keylist.txt"
        self.URLFILENAME = "urllist.txt"
        self.KEYLIST = set()
        self.URLLIST = set()
        self.URLFILE = open(self.URLFILENAME, 'w')

    def _readkey(self):
        '''Read all the keywords needed for the Baidu searches.'''
        with open(self.KEYFILENAME) as keyklistfile:
            for i in keyklistfile.readlines():
                self.KEYLIST.add(i.strip())  # strip the trailing newline
    def _changeurl(self, url):
        '''Turn a Baidu search-result URL into the real URL.'''
        try:
            req = requests.get(url+'&wd=')
            # time.sleep(1)
            # print(req.text)
            regx = r'http[s]*://baijiahao.baidu.com/[\S]*id=[0-9]*'
            pattern = re.compile(regx)
            match = re.findall(pattern, req.text)
            return match[0]
        except Exception as e:
            print(e)

    def _writetomysql(self):
        '''Write the real URLs into the database (not implemented yet).'''
        pass

    def _writetofile(self,url):
        self.URLFILE.write(url)
        self.URLFILE.write('\n')

    def sobaidu(self):
        '''Drive the functions above to do the whole job.'''
        # browser = webdriver.Chrome()
        # # browser = webdriver.PhantomJS()
        headers = {
            'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            'accept-encoding': "gzip, deflate, sdch, br",
            'accept-language': "zh-CN,zh;q=0.8,en;q=0.6",
            'cache-control': "no-cache",
            'connection': "keep-alive",
            'host': "www.baidu.com",
            'upgrade-insecure-requests': "1",
            'user-agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
            'postman-token': "d7d540ff-c479-210d-4d11-9b3deaba8395"
        }
        for key in self.KEYLIST:
            # fetch only the first results page for this keyword
            # browser.implicitly_wait(30)
            try:
                # browser.get('https://www.baidu.com/s?wd=site:(baijiahao.baidu.com)' + key)
                req = requests.get('http://www.baidu.com/s?wd=site:(baijiahao.baidu.com)'+ key, headers=headers)
            except Exception as e:
                time.sleep(20)
                continue
            source = req.text
            soup = bs4.BeautifulSoup(source, 'lxml')
                # print('next_page')
            for i in soup.findAll(class_='result c-container '):
                url = i.find(class_='t').find('a').get('href')
                print(url)
                try:
                    self._writetofile(url)
                except Exception as e:
                    pass
            time.sleep(1)
def main():
    dsfsd = sobaidu()
    # strings = dsfsd._changeurl('https://www.baidu.com/link?url=w8wWEQMyVf0cD3TsKcn_pTQZ92cIqLqxVZKWFtT4rYJcESE_qfhKlPJg5B7OM2mXhZoSM1H0ogmCIgi4G2EkP_&wd=&eqid=aa2c3db90000bf4c0000000458831761') 
    # print(strings)
    # # ###################### weird divider ###############
    dsfsd._readkey()
    print(len(dsfsd.KEYLIST))
    #########################  slightly silly divider #############
    dsfsd.sobaidu()
    # print(len(dsfsd.URLLIST))
    # for i in dsfsd.URLLIST:
    #     print(i)
    dsfsd.URLFILE.close()
if __name__ == '__main__':
    main()
    # getid()
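
One difference from bajihao.py: this version writes the raw www.baidu.com/link?url=... result links into urllist.txt without resolving them. That is fine for getid(), since the browser follows the redirect when it loads the page, but if you want the real article URLs in the file itself, they still need to go through something like _changeurl (or the redirect sketch above).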

Sample keylist.txt

爱好者
爱河
爱辉区
爱舰
爱将
爱克斯光
爱丽舍
爱侣
爱民区
爱女
爱妻
爱情
爱情股
爱情剧
爱情戏
爱情秀

Sample results

1558203514244407
1553580958364599
1554481414859790
1554581149767353
1553311983508765
1550392115650114
1549296295514136
1554229445165637
1554203514772608
1552292596126106

PS: I converted the URLs into app_ids so the blog doesn't end up with a bunch of useless links in it.
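
For completeness, here is a small sketch of how a collected profile URL can be reduced to the bare app_id shown above; it just pulls the 16 digits out of the id query parameter, the same idea as the regex in getid():

import re

def extract_app_id(url):
    '''Pull the 16-digit app_id out of a baijiahao.baidu.com profile URL.'''
    match = re.search(r'id=([0-9]{16})', url)
    return match.group(1) if match else None

# extract_app_id('http://baijiahao.baidu.com/u?app_id=1551032544962326')
# -> '1551032544962326'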
