跳轉中,訪問新網域網站 https://samiliu.xyz

logo头像
Snippet 博客主题

Python 多進程 (multiprocessing)

multiprocessing 筆記

Process


導入套件與進程 function

import time
import multiprocessing as mp

def claw(website):
    """Simulate crawling ``website``: pause, then print a greeting.

    The pause lasts ``len(website)`` seconds, so an argument with a longer
    name finishes later than one with a shorter name.
    """
    pause_seconds = len(website)
    time.sleep(pause_seconds)
    print(f'{website}: hello world')

執行多進程

# Guard the entry point: under the "spawn" start method (the default on
# Windows and macOS) each child process re-imports this module, so
# unguarded process creation would be executed again inside every child.
if __name__ == '__main__':
    # Create the processes.
    # A single argument still needs the trailing comma so that `args` is a
    # tuple; without it the string itself is unpacked and the call fails.
    work_1 = mp.Process(target=claw, args=('websiteAAAAAA',))
    work_2 = mp.Process(target=claw, args=('websiteB',))

    # Start both processes; they run concurrently from here on.
    print('Start Process')
    work_1.start()
    work_2.start()

    # Block until both processes have finished.
    print('Wait...')
    work_1.join()
    work_2.join()
    print('Done!!!')

打印結果

Start Process
Wait...
websiteB: hello world
websiteAAAAAA: hello world
Done!!!

Pool 進程池

導入套件與進程 function

使用多進程爬 Google 新聞前五則

import requests
import pandas as pd
from bs4 import BeautifulSoup
import multiprocessing as mp

def news_craw(topic, limit=5):
    """Fetch the first headlines of a Google News topic page.

    Args:
        topic: Topic identifier inserted into the ``/topics/<topic>`` URL.
        limit: Maximum number of headlines to return (defaults to 5).

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts, at most ``limit``
        long. Fewer entries (possibly none) are returned when the page
        contains fewer matching headlines.
    """
    google_news = 'https://news.google.com'
    url = f'{google_news}/topics/{topic}?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant'
    # Without a timeout a stalled connection would block the worker forever.
    res = requests.get(url, timeout=10)
    soup = BeautifulSoup(res.text, "html.parser")

    news = []
    # Run the selector once and read title and link from the same <h3>,
    # so the two can never get out of step with each other.
    for headline in soup.select('div.xrnccd h3'):
        if len(news) >= limit:
            break
        anchor = headline.select_one('a')
        if anchor is None or not anchor.get('href'):
            continue  # headline without a usable link
        news.append({
            'title': headline.text,
            # The first character of the href is dropped before it is
            # appended to the site root (presumably hrefs look like
            # './articles/...' -- verify against the live page).
            'link': google_news + anchor['href'][1:],
        })
    return news

新聞主題

# Google News topic identifiers, one per news category.
google_news_topic = [
    # Taiwan
    'CAAqJQgKIh9DQkFTRVFvSUwyMHZNRFptTXpJU0JYcG9MVlJYS0FBUAE',
    # International
    'CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx1YlY4U0JYcG9MVlJYR2dKVVZ5Z0FQAQ',
    # Business
    'CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx6TVdZU0JYcG9MVlJYR2dKVVZ5Z0FQAQ',
    # Entertainment
    'CAAqKggKIiRDQkFTRlFvSUwyMHZNREpxYW5RU0JYcG9MVlJYR2dKVVZ5Z0FQAQ',
]

執行多進程

pool.map

# The pool is sized explicitly to four workers; when `processes` is
# omitted it defaults to the number of CPU cores.
with mp.Pool(4) as pool:
    res = pool.map(news_craw, google_news_topic)

# Flatten the per-topic result lists into a single list of news records.
new_list = []
for topic_news in res:
    new_list.extend(topic_news)
df = pd.DataFrame(new_list)
df

pool.apply_async

pool = mp.Pool(processes=4)

# Submit one asynchronous task per topic and keep the result handles.
multiple_res = [
    pool.apply_async(news_craw, args=(topic,)) for topic in google_news_topic
]

pool.close()  # no further tasks will be submitted
pool.join()   # wait for the worker processes to finish

# Collect each task's return value with get() and flatten them into one list.
new_list = []
for pending in multiple_res:
    new_list.extend(pending.get())
df = pd.DataFrame(new_list)
df

打印結果

在使用上兩者結果是一樣的