데이터 분석/Python

[텍스트 마이닝-수집] 네이버 블로그 스크래핑

초코레모네이드 2023. 3. 16. 16:24

 

import pandas as pd
import time
import urllib.request
from selenium.common.exceptions import NoSuchElementException, UnexpectedAlertPresentException, TimeoutException
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm.notebook import tqdm


# Webdriver setup.
# NOTE: the original created two separate Options objects and only passed the
# second one to Chrome, silently dropping the automation-hiding switches; a
# single object now carries all settings.
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)
# The UA string must be passed as "user-agent=<value>"; a bare UA string
# (as in the original) is ignored by Chrome.
chrome_options.add_argument(
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
)
# Selenium 4 removed the executable_path keyword; Selenium Manager resolves
# the driver binary automatically. (Pass service=Service("chromedriver.exe")
# if a pinned local driver is required.)
driver = webdriver.Chrome(options=chrome_options)

# Open Naver, run a search, then narrow the results to the Blog tab.
driver.maximize_window()
driver.get("https://www.naver.com/")
time.sleep(2)

query_text = input("검색어 입력: ")

# Type the keyword into the main search box and submit the form.
search_box = driver.find_element(By.ID, "query")
search_box.send_keys(query_text)
search_box.submit()
time.sleep(2)

# Walk through the result-page tabs in order: VIEW -> 블로그 -> 옵션.
for tab_label in ("VIEW", "블로그", "옵션"):
    driver.find_element(By.LINK_TEXT, tab_label).click()
    time.sleep(1)

# Pick the second entry in the sort-option list.
driver.find_element(By.CSS_SELECTOR, "#snb > div.api_group_option_sort._search_option_detail_wrap > ul > li.bx.lineup > div > div > a:nth-child(2)").click()
time.sleep(1)

# ---- Search options: open 기간 > 직접입력 and set a custom date range ----

# Open the custom-period calendar widget.
driver.find_element(By.CSS_SELECTOR, "#snb > div.api_group_option_sort._search_option_detail_wrap > ul > li.bx.term > div > div.option > a.txt.txt_option._calendar_select_trigger").click()
time.sleep(1)

# Shared CSS-selector prefixes for the calendar widget (kept to shorten the
# very long selectors below).
GS_head_css = "#snb > div.api_group_option_sort._search_option_detail_wrap > ul > li.bx.term > div > div.api_select_option.type_calendar._calendar_select_layer > "
GSYMD_head_css = "#snb > div.api_group_option_sort._search_option_detail_wrap > ul > li.bx.term > div > div.api_select_option.type_calendar._calendar_select_layer > div.select_wrap._root > "


def _open_range_endpoint(span_index):
    """Open one endpoint of the date range: span 1 = start date, span 3 = end date."""
    driver.find_element(By.CSS_SELECTOR, GS_head_css + "div.set_calendar > span:nth-child(" + str(span_index) + ") > a").click()
    time.sleep(1)


def _pick_date(year_item, month_item, day_item):
    """Click year/month/day list items (1-based positions in each column).

    Year column positions observed on the page: li 18 = 2020, li 19 = 2021,
    li 20 = 2022.  Month/day columns are 1-based (li 1 = January / the 1st).
    """
    for column, item in ((1, year_item), (2, month_item), (3, day_item)):
        driver.find_element(
            By.CSS_SELECTOR,
            GSYMD_head_css + "div:nth-child(" + str(column) + ") > div > div > div > ul > li:nth-child(" + str(item) + ")",
        ).click()
        time.sleep(1)


# Start date: 2022-01-01.
_open_range_endpoint(1)
_pick_date(20, 1, 1)

# End date: 2022-11-28.
_open_range_endpoint(3)
_pick_date(20, 11, 28)

# Apply the selected period.
driver.find_element(By.CSS_SELECTOR, GS_head_css + "div.btn_area > button").click()
time.sleep(1)


# Scroll to the bottom repeatedly until the page height stops growing
# (handles infinite-scroll result pages).
prev_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Jump to the current bottom of the page.
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    # Give lazy-loaded results time to render.
    time.sleep(3)
    current_height = driver.execute_script("return document.body.scrollHeight")
    # Height unchanged -> no more content was loaded; stop.
    if current_height == prev_height:
        break
    # Reuse the height just measured instead of querying the DOM again:
    # the original re-ran execute_script here, which could race with content
    # loading and cause an extra (or missed) iteration.
    prev_height = current_height

# Collect the search results: post URL, title, and posting date.
class_articles = ".api_txt_lines.total_tit"
url_link = driver.find_elements(By.CSS_SELECTOR, class_articles)
# Posting-date elements.
class_datetime = ".sub_time.sub_txt"
date_time = driver.find_elements(By.CSS_SELECTOR, class_datetime)

# Comprehensions replace the original's three manual append loops
# (url_link was iterated twice).
url_list = [article.get_attribute('href') for article in url_link]
title_list = [article.text for article in url_link]
# Loop variable renamed from `datetime` to avoid shadowing the stdlib module.
date_list = [elem.text for elem in date_time]


# 데이터 프레임 저장 (DataFrame saving step — the code was omitted in the original post;
# e.g. pd.DataFrame({'url': url_list, 'title': title_list, 'date': date_list}).to_csv(...))

import pandas as pd
import numpy as np
import re
import os
import sys
from tqdm.notebook import tqdm
import pandas as pd
import time
import urllib.request
from selenium.common.exceptions import NoSuchElementException, UnexpectedAlertPresentException 
from selenium import webdriver
from selenium.webdriver.common.by import By

# Second stage: open each collected blog URL and scrape its body text.
# Selenium Manager locates chromedriver automatically, so no explicit driver
# path is needed (the original assigned an unused `path` variable, which has
# been removed as dead code).
driver = webdriver.Chrome()
driver.maximize_window()
driver.implicitly_wait(3)

# Load the URL list produced by the collection step.
# read_csv() defaults to sep=','; sep='\t' is passed here for a TSV export.
df = pd.read_csv("파일경로", sep='\t', encoding='UTF-8')
print(df.shape, df.columns)
df.head()

blog_links = df['url'].to_list()

contents = []
datetime = []   # post dates; NOTE: name kept from original, shadows the stdlib module
old_blogs = []  # URLs that use the legacy (pre-SmartEditor) blog layout

for link in tqdm(blog_links):
    driver.get(link)
    time.sleep(1)
    # Blog content lives inside the "mainFrame" iframe.
    try:
        driver.switch_to.frame("mainFrame")
    except UnexpectedAlertPresentException:
        # A javascript alert (private/moved post) blocks the frame switch.
        print('{} 게시글이 비공개로 전환되었거나 게시판이 바뀜.'.format(link))
        time.sleep(2)
        try:
            driver.switch_to.frame('mainFrame')
        except UnexpectedAlertPresentException:
            # Still blocked -> skip this post instead of crashing the loop
            # (the original retried unguarded and could abort the whole run).
            continue
    try:
        a = driver.find_element(By.CSS_SELECTOR, 'div.se-main-container').text
        d = driver.find_element(By.CSS_SELECTOR, "div.se-component-content > div > div.blog2_container > span.se_publishDate.pcol2").text
        contents.append(a)
        datetime.append(d)
    # NoSuchElementException -> legacy blog layout uses a different container.
    except NoSuchElementException:
        a = driver.find_element(By.CSS_SELECTOR, 'div#content-area').text
        contents.append(a)
        # Append a placeholder date so contents/datetime stay index-aligned
        # (the original skipped this append, desynchronizing the two lists).
        datetime.append('')
        old_blogs.append(link)