# URL of the Yahoo News article to look up and scrape.
url: str = ""
# Scraping routine
def news_scraping_yahoo(url, timeout=10):
    """Download an article page and extract its cleaned body text.

    Args:
        url: URL of the article page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a server that never answers would block the call forever.

    Returns:
        A list that is empty when no element with class ``article_body``
        exists or when its text is empty after ``shap_text``; otherwise it
        holds the single entry ``[cleaned_text, url]``.

    Exceptions raised by ``requests.get`` (including timeouts) are not
    caught here and propagate to the caller.
    """
    result_news = []
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    # Look the body element up once and reuse it.
    article_body = soup.find(class_='article_body')
    if article_body is None:
        # Message: "the content of the article could not be found".
        logging.warning("「{}」の記事の内容が見つかりませんでした".format(url))
        return result_news

    news_content = shap_text(article_body.text)
    if news_content == "":
        # Message: "the content of the article could not be summarized";
        # emitted when nothing is left after cleaning.
        logging.warning("「{}」の記事の内容を要約できませんでした".format(url))
        return result_news

    result_news.append([news_content, url])
    return result_news
def _known_emoji():
    """Return the emoji lookup table of the installed ``emoji`` package."""
    # Newer releases expose EMOJI_DATA and no longer provide UNICODE_EMOJI.
    table = getattr(emoji, "EMOJI_DATA", None)
    if table is not None:
        return table
    table = emoji.UNICODE_EMOJI
    # emoji 1.x nests one table per language code; older releases are flat.
    per_language = table.get("en")
    return per_language if isinstance(per_language, dict) else table


# Article text clean-up routine
def shap_text(content):
    """Strip line breaks, URLs, punctuation, emoji and spaces from text.

    Args:
        content: Raw article text.

    Returns:
        The cleaned text as one line; an empty string if nothing remains.
    """
    # Collapse the text onto a single line.
    news_content = "".join(content.splitlines())
    # Remove http/https/ftp URLs while their punctuation is still intact.
    news_content = re.sub(
        r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)", "", news_content)
    # Remove every ASCII punctuation/symbol character ('.' included).
    news_content = re.sub("[!-/:-@[-`{-~]", '', news_content)
    # Remove single-character emoji known to the emoji package.
    known_emoji = _known_emoji()
    news_content = ''.join(
        c for c in news_content if c not in known_emoji)
    # Remove ASCII, no-break and ideographic (full-width) spaces.
    news_content = re.sub(r'[ \u00a0\u3000]', '', news_content)
    # Remove U+FE30..U+FF20 (compatibility forms and full-width symbols).
    # NOTE(review): this range also covers full-width digits U+FF10-U+FF19;
    # confirm that dropping them is intended.
    news_content = re.sub(r'[︰-@]', '', news_content)
    return news_content
# Cleaned body text of the scraped article. news_scraping_yahoo() returns an
# empty list when nothing could be extracted, so fall back to "" rather than
# raising IndexError on the [0][0] lookup.
_scraped_articles = news_scraping_yahoo(url)
text = _scraped_articles[0][0] if _scraped_articles else ""