티스토리 뷰
반응형
Python Crawling Useful features
- Read Excel File & Show Progress bar & Make DataFrame
import pandas as pd
from tqdm import tqdm
# Folder that holds both the input workbook and the exported result.
DATA_DIR = 'C:\\Users\\cristoval\\Desktop\\data\\'

file_name = 'test_file'
file_df = pd.read_excel(DATA_DIR + file_name + '.xlsx')

# Collect the output rows in a plain list and build the DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
result_rows = []

# iterrows() is a generator with no length, so pass the row count explicitly
# to let tqdm show a percentage and an ETA.
for idx, row in tqdm(file_df.iterrows(), total=len(file_df)):
    # do something: derive _id, title and link from `row` here
    result_rows.append({'id': _id, 'title': title, 'link': link})

# Explicit column list keeps the header even when no row was collected.
result_df = pd.DataFrame(result_rows, columns=['id', 'title', 'link'])

result_file_name = 'result_file'
result_df.to_excel(DATA_DIR + '{0}.xlsx'.format(result_file_name), index=False)
- Selenium Chrome debug mode
import shutil
import subprocess

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

# chrome.exe file path : C:\Program Files\Google\Chrome\Application\chrome.exe
# chrome port : 9222
# cookie/cache file path : C:\chrometemp
# chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\chrometemp"
CHROME_EXE = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
CHROMEDRIVER_EXE = r'C:\Program Files\python\chromedriver.exe'
CHROME_PROFILE_DIR = r'C:\chrometemp'
DEBUG_PORT = 9222

try:
    shutil.rmtree(CHROME_PROFILE_DIR)  # remove cookie/cache file
except FileNotFoundError:
    pass

# Pass the command as an argument list so the space in "Program Files" cannot
# be misread as an argument separator.
subprocess.Popen([
    CHROME_EXE,
    '--remote-debugging-port={0}'.format(DEBUG_PORT),
    '--user-data-dir={0}'.format(CHROME_PROFILE_DIR),
])

# Attach to the Chrome instance started above instead of launching a new one.
option = Options()
option.add_experimental_option("debuggerAddress", "127.0.0.1:{0}".format(DEBUG_PORT))

# chromedriver.exe path & Set option
# (Selenium 4 API: the driver path is supplied through a Service object.)
driver = webdriver.Chrome(service=Service(executable_path=CHROMEDRIVER_EXE), options=option)
driver.implicitly_wait(10)

try:
    URL = 'https://data-make.tistory.com/'
    driver.get(url=URL)
    # NOTE: implicitly_wait only changes the element-lookup timeout (now 1s);
    # it does not pause the script.
    driver.implicitly_wait(time_to_wait=1)
    driver.close()
finally:
    # close() only closes the window; quit() also ends the session and stops
    # the chromedriver process, even when a step above raised.
    driver.quit()
Example Code
import shutil
import subprocess

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from tqdm import tqdm

# Folder that holds both the input workbook and the exported result.
DATA_DIR = 'C:\\Users\\cristoval\\Desktop\\data\\'
CHROME_EXE = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
CHROMEDRIVER_EXE = r'C:\Program Files\python\chromedriver.exe'
CHROME_PROFILE_DIR = r'C:\chrometemp'
DEBUG_PORT = 9922

########################
# Read Excel File
########################
file_name = 'example'
file_df = pd.read_excel(DATA_DIR + file_name + '.xlsx')

##########################
# Run Chrome Debug Mode
##########################
try:
    shutil.rmtree(CHROME_PROFILE_DIR)  # remove cookie/cache file
except FileNotFoundError:
    pass

# Pass the command as an argument list so the space in "Program Files" cannot
# be misread as an argument separator.
subprocess.Popen([
    CHROME_EXE,
    '--remote-debugging-port={0}'.format(DEBUG_PORT),
    '--user-data-dir={0}'.format(CHROME_PROFILE_DIR),
])

# Attach to the Chrome instance started above instead of launching a new one.
option = Options()
option.add_experimental_option("debuggerAddress", "127.0.0.1:{0}".format(DEBUG_PORT))
# Selenium 4 API: the driver path is supplied through a Service object.
driver = webdriver.Chrome(service=Service(executable_path=CHROMEDRIVER_EXE), options=option)
driver.implicitly_wait(10)

###################
# Start Crawling
###################
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame per call.
result_rows = []

try:
    # iterrows() has no length, so give tqdm the row count for a real
    # percentage / ETA display.
    for idx, row in tqdm(file_df.iterrows(), total=len(file_df)):
        URL = 'https://data-make.tistory.com/' + str(row['id'])
        driver.get(url=URL)
        # NOTE: implicitly_wait only changes the element-lookup timeout
        # (now 1s); it does not pause the script.
        driver.implicitly_wait(time_to_wait=1)

        try:
            # Click on a specific element
            element_btn = driver.find_element(By.ID, 'id-name')
            element_btn.click()

            # Get data by element (look the anchor up once, read it twice)
            element_box = driver.find_element(By.CLASS_NAME, 'tit_post')
            anchor = element_box.find_element(By.TAG_NAME, 'a')
            link = anchor.get_attribute('href')
            title = anchor.text
        except WebDriverException:
            # Skip pages whose elements are missing or not clickable, but let
            # KeyboardInterrupt and programming errors propagate.
            print('fail')
            continue

        # do something
        result_rows.append({'id': str(row['id']), 'title': title, 'link': link})

    driver.close()
finally:
    # close() only closes the window; quit() also ends the session and stops
    # the chromedriver process, even when the crawl raised.
    driver.quit()

################################################
# Extract to Excel File
################################################
# Explicit column list keeps the header even when every page failed.
result_df = pd.DataFrame(result_rows, columns=['id', 'title', 'link'])

result_file_name = 'result'
result_df.to_excel(DATA_DIR + '{0}.xlsx'.format(result_file_name), index=False)
Useful Functions
import requests
'''
import json
import requests
json_data : data = {'language': "ko", "data": {"name": 'Cristoval', "age": 20}} # dictionary
json_data = json.dumps(data) # convert dictionary to json
url : url
'''
def Post_api(json_data, url, timeout=None):
    """Send ``json_data`` to ``url`` as the body of an HTTP POST request.

    Args:
        json_data: Request body, forwarded to ``requests.post`` as ``data``
            (for example a string produced by ``json.dumps``).
        url: Address to post to.
        timeout: Seconds to wait for the server, forwarded unchanged to
            ``requests.post``. The default ``None`` keeps the previous
            behaviour of waiting without a limit; pass a number so an
            unresponsive server cannot hang the caller.

    Returns:
        The response object returned by ``requests.post``.
    """
    return requests.post(url, data=json_data, timeout=timeout)
def List_to_text_file(file_path, file_name, list):
    """Write every item of ``list`` to a UTF-8 text file, one item per line.

    The target is ``file_path + file_name`` (plain concatenation, so
    ``file_path`` must already end with a path separator). Items are
    converted with ``str``. An item that cannot be converted or written is
    reported on stdout and skipped; the remaining items are still written.

    Args:
        file_path: Directory prefix of the output file.
        file_name: Name of the output file.
        list: Iterable of items to write. The name shadows the builtin but is
            kept because callers may pass it by keyword.
    """
    with open(file_path + file_name, "w", encoding='utf-8') as output:
        for row in list:
            try:
                line = str(row)
            except Exception:
                # str(row) itself failed, so it must not be called again in
                # the message; report the item by its type name instead.
                print("except :: <unprintable {0}>".format(type(row).__name__))
                continue

            try:
                output.write(line + '\n')
            except Exception:
                # Best-effort export: report the item and keep going.
                print("except :: " + line)
'''
import pandas as pd
result_df : columns = {'ko': [], 'en': [], 'result': []}
result_df = pd.DataFrame(data=columns)
# Loop
result_df = result_df.append({'ko': ko_str, 'en': en_str, 'result' : result_str}, ignore_index=True)
'''
def DataFrame_to_Excel(result_df, file_path, export_file_name):
    """Export ``result_df`` to ``<file_path><export_file_name>.xlsx``.

    ``file_path`` is used as a plain string prefix, and the index column is
    left out of the workbook.
    """
    workbook_name = '{0}.xlsx'.format(export_file_name)
    destination = file_path + workbook_name
    result_df.to_excel(destination, index=False)
Reference
반응형
'Python' 카테고리의 다른 글
[자연어처리] Subword Tokenizer (BPE, SentencePiece, Wordpiece Model) (0) | 2021.10.12 |
---|---|
파이썬 멀티프로세싱 (Python Multiprocessing) (0) | 2021.08.23 |
Wikipedia Data collection & analysis (Wikipedia 국/영문 데이터 수집/분석) (0) | 2021.08.09 |
[NLP] 딥 러닝을 이용한 자연어 처리 입문(Text preprocessing) (0) | 2021.07.17 |
[Python NLP] OpenNMT-py (translate) (0) | 2021.07.05 |
댓글