
Two Python multithreaded scraping code examples
Source: 易贤网 | Date: 2014-07-08 19:05:14

This article presents two multithreaded scraping examples in Python 2, built on the threading, Queue, urllib2, and MySQLdb modules.

Example 1:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import threading
import Queue
import sys
import urllib2
import re
import MySQLdb

#
# Database settings
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"

#
# Worker settings
#
THREAD_LIMIT = 3                 # number of spider threads
jobs = Queue.Queue(5)            # bounded job queue
singlelock = threading.Lock()    # keeps print output from interleaving
info = Queue.Queue()             # thread-safe result queue

def workerbee(inputlist):
    for x in xrange(THREAD_LIMIT):
        print 'Thread {0} started.'.format(x)
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except Queue.Full:
            singlelock.acquire()
            print "The queue is full!"
            singlelock.release()
    # Wait for the threads to finish
    singlelock.acquire()        # Acquire the lock so we can print
    print "Waiting for threads to finish."
    singlelock.release()        # Release the lock
    jobs.join()                 # Blocks until task_done() has been called for every job

def getTitle(url, timeout=10):
    response = urllib2.urlopen(url, timeout=timeout)
    html = response.read()
    response.close()
    reg = r'<title>(.*?)</title>'
    title = re.compile(reg).findall(html)
    # title = title[0].decode('gb2312', 'replace').encode('utf-8')
    title = title[0]
    return title

class spider(threading.Thread):
    def run(self):
        while 1:
            try:
                job = jobs.get(True, 1)    # give up after 1 idle second and exit
            except Queue.Empty:
                break
            try:
                # Fetch outside the lock: Queue.Queue is already thread-safe,
                # and holding the lock across network I/O would serialize the
                # threads (and deadlock jobs.join() if getTitle() raised).
                title = getTitle(job[1])
                info.put([job[0], title], block=True, timeout=5)
            except Exception:
                pass                       # skip URLs that fail to fetch or parse
            finally:
                jobs.task_done()           # always account for the job

if __name__ == '__main__':
    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST, DB_USER, DB_PASSWD, DB_NAME)
        cur = con.cursor()
        cur.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        rows = cur.fetchall()
        for row in rows:
            urls.append([row[0], row[1]])
        workerbee(urls)
        while not info.empty():
            print info.get()
    finally:
        if con:
            con.close()
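
One caveat with this worker loop: shutdown relies on the 1-second timeout in jobs.get, so a producer that pauses for more than a second can make the workers exit before all the work is queued. A common alternative is an explicit sentinel per worker. Below is a minimal, illustrative sketch of that variant (my own rewrite in the same Python 2 style, not code from the original article):

# Sentinel-based shutdown for the same producer/consumer pattern.
# Each worker blocks on get() with no timeout and exits when it pulls
# the None sentinel that the producer enqueues once per thread.
import threading
import Queue

THREAD_LIMIT = 3
jobs = Queue.Queue(5)

def worker():
    while True:
        job = jobs.get()              # block indefinitely; no timeout race
        if job is None:               # sentinel: shut this worker down
            jobs.task_done()
            break
        try:
            print 'processing {0}'.format(job)   # real work goes here
        finally:
            jobs.task_done()          # always account for the item

threads = [threading.Thread(target=worker) for _ in xrange(THREAD_LIMIT)]
for t in threads:
    t.start()
for i in xrange(10):                  # enqueue the real jobs
    jobs.put(i)
for _ in threads:                     # then one sentinel per worker
    jobs.put(None)
jobs.join()                           # returns once every item is accounted for

Because jobs.put blocks once the bounded queue fills up, the producer naturally throttles itself to the workers' pace.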

Example 2:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Filename: robot.py

import threading, Queue, sys, urllib2, re

#
# Settings
#
THREAD_LIMIT = 3                 # number of spider threads
jobs = Queue.Queue(5)            # bounded job queue
singlelock = threading.Lock()    # keeps print output from interleaving

urls = ['http://xxx.com/w/n/2013-04-28/1634703505.shtml',
        'http://xxx.com/w/n/2013-04-28/1246703487.shtml',
        'http://xxx.com/w/n/2013-04-28/1028703471.shtml',
        'http://xxx.com/w/n/2013-04-27/1015703426.shtml',
        'http://xxx.com/w/n/2013-04-26/1554703373.shtml',
        'http://xxx.com/w/n/2013-04-26/1512703346.shtml',
        'http://xxx.com/w/n/2013-04-26/1453703334.shtml',
        'http://xxx.com/w/n/2013-04-26/1451703333.shtml',
        'http://xxx.com/w/n/2013-04-26/1445703329.shtml',
        'http://xxx.com/w/n/2013-04-26/1434703322.shtml',
        'http://xxx.com/w/n/2013-04-26/1433703321.shtml',
        'http://xxx.com/w/n/2013-04-26/1433703320.shtml',
        'http://xxx.com/w/n/2013-04-26/1429703318.shtml',
        'http://xxx.com/w/n/2013-04-26/1429703317.shtml',
        'http://xxx.com/w/n/2013-04-26/1409703297.shtml',
        'http://xxx.com/w/n/2013-04-26/1406703296.shtml',
        'http://xxx.com/w/n/2013-04-26/1402703292.shtml',
        'http://xxx.com/w/n/2013-04-26/1353703286.shtml',
        'http://xxx.com/w/n/2013-04-26/1348703284.shtml',
        'http://xxx.com/w/n/2013-04-26/1327703275.shtml',
        'http://xxx.com/w/n/2013-04-26/1239703265.shtml',
        'http://xxx.com/w/n/2013-04-26/1238703264.shtml',
        'http://xxx.com/w/n/2013-04-26/1231703262.shtml',
        'http://xxx.com/w/n/2013-04-26/1229703261.shtml',
        'http://xxx.com/w/n/2013-04-26/1228703260.shtml',
        'http://xxx.com/w/n/2013-04-26/1223703259.shtml',
        'http://xxx.com/w/n/2013-04-26/1218703258.shtml',
        'http://xxx.com/w/n/2013-04-26/1202703254.shtml',
        'http://xxx.com/w/n/2013-04-26/1159703251.shtml',
        'http://xxx.com/w/n/2013-04-26/1139703233.shtml']

def workerbee(inputlist):
    for x in xrange(THREAD_LIMIT):
        print 'Thread {0} started.'.format(x)
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except Queue.Full:
            singlelock.acquire()
            print "The queue is full!"
            singlelock.release()
    # Wait for the threads to finish
    singlelock.acquire()    # Acquire the lock so we can print
    print "Waiting for threads to finish."
    singlelock.release()    # Release the lock
    jobs.join()             # Blocks until task_done() has been called for every job

def getTitle(url, timeout=10):
    response = urllib2.urlopen(url, timeout=timeout)
    html = response.read()
    response.close()
    reg = r'<title>(.*?)</title>'
    title = re.compile(reg).findall(html)
    title = title[0].decode('gb2312', 'replace').encode('utf-8')
    return title

class spider(threading.Thread):
    def run(self):
        while 1:
            try:
                job = jobs.get(True, 1)    # give up after 1 idle second and exit
            except Queue.Empty:
                break
            try:
                title = getTitle(job)      # fetch outside the lock
                singlelock.acquire()
                print 'This {0} is {1}'.format(job, title)
                singlelock.release()
            except Exception:
                pass                       # skip URLs that fail to fetch or parse
            finally:
                jobs.task_done()           # always account for the job

if __name__ == '__main__':
    workerbee(urls)
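
Both examples target Python 2 (print statements, Queue, urllib2, xrange). For reference, here is a minimal sketch of how example 2's pattern maps onto Python 3, where the relevant modules were renamed (queue, urllib.request) and print became a function. This port is illustrative, not from the original article, and xxx.com remains the article's placeholder domain:

# Python 3 sketch of example 2 (illustrative port, not from the article).
import queue
import re
import threading
import urllib.request

THREAD_LIMIT = 3
jobs = queue.Queue(5)
printlock = threading.Lock()

def get_title(url, timeout=10):
    with urllib.request.urlopen(url, timeout=timeout) as response:
        html = response.read().decode('gb2312', 'replace')
    match = re.search(r'<title>(.*?)</title>', html)
    return match.group(1) if match else ''

def worker():
    while True:
        try:
            job = jobs.get(True, 1)    # same 1-second shutdown timeout as the original
        except queue.Empty:
            break
        try:
            title = get_title(job)
            with printlock:            # keep output from interleaving
                print('This {0} is {1}'.format(job, title))
        except Exception:
            pass                       # skip URLs that fail to fetch or parse
        finally:
            jobs.task_done()

if __name__ == '__main__':
    urls = ['http://xxx.com/w/n/2013-04-28/1634703505.shtml']   # placeholder URL from the article
    for _ in range(THREAD_LIMIT):
        threading.Thread(target=worker).start()
    for u in urls:
        jobs.put(u)
    jobs.join()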
