
NaverNews_Crawler

Naver News Crawler

from bs4 import BeautifulSoup
from datetime import datetime, timedelta, date
import requests
import pandas as pd
import re
import os
from math import ceil

def crawler(query, sort, s_date, e_date):
    # query: search keyword, sort: Naver sort option, s_date/e_date: 'YYYY.MM.DD'
    query = query.replace(' ', '+')
    s_from = s_date.replace(".", "")
    e_to = e_date.replace(".", "")

    # find the total result count (and from it the max page number)
    # 1. build the search URL
    basic_url = "https://search.naver.com/search.naver?where=news"
    url = (basic_url + '&query=' + query + "&sm=tab_opt&sort=" + str(sort) +
           "&nso=so%3Ar%2Cp%3Afrom" + s_from + "to" + e_to)

    # 2. fetch and parse the page
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')

    # total result count: parse the number after '/' in the result-count text
    # (the 'title_desc all_my' class depends on Naver's markup at the time of writing)
    total_count = soup.find_all(attrs={'class': 'title_desc all_my'})[0].get_text()
    total_count = total_count.split('/')[1].replace(',', '')
    total_count = int(re.findall(r'\d+', total_count)[0])

#     maxpage = ceil(total_count / 10)
#     maxpage_t = (int(maxpage) - 1) * 10 + 1   # start index: 11 = page 2, 21 = page 3, ... 91 = page 10, 101 = page 11
#     print('maxPage number : {}'.format(maxpage))

#     # NOTE: the loop below assumes the result lists (title_text, link_text, ...) and
#     # the cleansing helpers are defined elsewhere in the post.

#     page = 1  
#     while page <= maxpage_t:
#         # 1. url setting
#         basic_url = "https://search.naver.com/search.naver?where=news"
#         url = str(basic_url + '&query=' + query + "&sm=tab_opt&sort="+ str(sort)+
#                   "&nso=so%3Ar%2Cp%3Afrom" + s_from + "to" + e_to +
#                   "%2Ca%3A&start=" + str(page))

#         # 2. get url
#         response = requests.get(url)
#         html = response.text

#         # parse the response with BeautifulSoup
#         soup = BeautifulSoup(html, 'html.parser')

#         # extract the title and link URL from each <a> tag
#         atags = soup.select('._sp_each_title')
#         for atag in atags:
#             title_text.append(atag.text)     # title
#             link_text.append(atag['href'])   # link URL

#         # extract the press (news source)
#         source_lists = soup.select('._sp_each_source')
#         for source_list in source_lists:
#             source_text.append(source_list.text)    # press name

#         # extract the date
#         date_lists = soup.select('.txt_inline')
#         for date_list in date_lists:
#             test = date_list.text
#             date_cleansing(test)  # apply the date-cleansing helper

#         # article summary
#         contents_lists = soup.select('ul.type01 dl')
#         for contents_list in contents_lists:
#             #print('===' * 40)
#             #print(contents_list)
#             contents_cleansing(contents_list)  # cleanse the summary text


#         # store every list in a dictionary
#         result = {"date": date_text,
#                   "title": title_text,
#                   "source": source_text,
#                   "contents": contents_text,
#                   "link": link_text,
#                   "query": query}
# #         print(page)

#         df = pd.DataFrame(result)  # convert to a DataFrame
#         page += 10

    return total_count  # , df  (also return the DataFrame once the loop above is enabled)
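
For reference, a minimal call sketch. The keyword ('빅데이터', i.e. "big data") and the dates below are just placeholders, and the sort value is passed straight through to the URL's sort parameter (0 was the relevance option on Naver news search at the time of writing):

total = crawler('빅데이터', 0, '2019.01.04', '2019.01.05')
print('total articles : {}'.format(total))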