TorrentCrawler/crawler.py
import io
import re
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Re-wrap stdout/stderr as UTF-8 so Korean titles print correctly on
# consoles with a non-UTF-8 default encoding (e.g. cp949 on Windows).
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')
class torrent_file:
    # A single torrent attachment: display name plus its download link.
    # Python does not support overloaded constructors, so the original pair
    # of __init__ methods (the second silently shadowed the first) is
    # collapsed into one with default arguments.
    def __init__(self, name='', link=''):
        self.file_name = name
        self.link = link

    def __str__(self):
        return "{{ {}: {} }}".format(self.file_name, self.link)

    def __repr__(self):
        return str(self)
class Options:
    def __init__(self):
        # Board list pages to crawl; the page number gets appended to each URL.
        self.urls = [
            'https://todaum.com/bbs/board.php?bo_table=torrent_kortv_ent&device=pc&page=',
        ]
        self.max_page = 1
        # Regex patterns for the releases to pick up: show name, 720p, NEXT.
        self.keywords = [
            '무한도전.*720p.*NEXT',
            '한끼줍쇼.*720p.*NEXT',
            '라디오스타.*720p.*NEXT',
        ]
        # Episode numbers already fetched, keyed by show name.
        self.downloaded_ep = {
            '무한도전': [533],
            '라디오스타': [531],
        }

    def save(self):
        pass  # persistence not implemented yet

    def load(self):
        pass  # persistence not implemented yet
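
# The save()/load() stubs above suggest Options is meant to persist between
# runs. A minimal sketch of how that could look with JSON; the file name
# 'options.json' and the helper names are illustrative, not part of the
# original script.
import json

def save_options(option, path='options.json'):
    data = {
        'urls': option.urls,
        'max_page': option.max_page,
        'keywords': option.keywords,
        'downloaded_ep': option.downloaded_ep,
    }
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def load_options(path='options.json'):
    option = Options()
    with open(path, encoding='utf-8') as f:
        data = json.load(f)
    option.urls = data['urls']
    option.max_page = data['max_page']
    option.keywords = data['keywords']
    option.downloaded_ep = data['downloaded_ep']
    return option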
def crawl_list(url, option):
    # Fetch one board list page and collect the links to its post pages.
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    links = []
    for link in soup.select('#fboardlist > div.tbl_head01.tbl_wrap > table > tbody > tr > td.td_subject > a'):
        # Resolve the href against the page URL; the original concatenated
        # the two strings, which breaks for relative hrefs, and left the
        # append commented out behind debug prints.
        href = urljoin(url, link.get('href'))
        links.append(href)
    return links
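
# crawl_list collects every post on the page, while option.keywords and
# option.downloaded_ep are clearly meant to narrow that down. A hedged
# sketch of that filtering step; the helper name is an assumption, and the
# episode regex mirrors the commented experiments at the bottom of this file.
def wanted(title, option):
    # Keep only titles matching one of the configured release patterns.
    if not any(re.search(pattern, title) for pattern in option.keywords):
        return False
    # Extract the episode number from titles like '무한도전.E533.170610.720p NEXT'.
    m = re.search(r'[ .,]E([0-9]+)[ .,]', title)
    if m:
        episode = int(m.group(1))
        for show, episodes in option.downloaded_ep.items():
            if show in title and episode in episodes:
                return False  # already downloaded
    return True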
def crawl_downlink(url):
    # Open a post page and collect its attached torrent download links.
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    links = []
    a_tags = soup.find(id='bo_v_file').find_all('a', {'class': 'view_file_download'})
    for tag in a_tags:
        file_name = str(tag.find('strong').text)
        links.append(torrent_file(file_name, tag.get('href')))
    return links
def download_files(files):
    for file in files:
        response = requests.get(file.link)
        with open(file.file_name, 'wb') as f:
            f.write(response.content)
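
# .torrent files are small, so reading response.content at once is fine; for
# larger payloads a streamed variant avoids holding the whole file in memory.
# A sketch using requests' stream mode (not part of the original):
def download_files_streamed(files, chunk_size=8192):
    for file in files:
        with requests.get(file.link, stream=True) as response:
            with open(file.file_name, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)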
def print_log(files):
    # Append the downloaded file names to a simple text log. The explicit
    # encoding matters for the Korean titles on non-UTF-8 locales.
    with open('log.txt', 'at', encoding='utf-8') as f:
        for file in files:
            f.write(file.file_name + '\n')
def crawl(url, option):
    # Crawl one list page end to end: list -> post pages -> downloads.
    # (The original called crawl_list(url) without the option argument.)
    page_links = crawl_list(url, option)
    files = []
    for link in page_links:
        files += crawl_downlink(link)
    print_log(files)
    download_files(files)
def do_it():
    option = Options()
    files = []
    for url in option.urls:
        for page in range(1, option.max_page + 1):
            page_url = url + str(page)
            page_links = crawl_list(page_url, option)
            # for link in page_links:
            #     files += crawl_downlink(link)
            #
            # download_files(files)

# crawl(option.urls[0], option)
do_it()
# Scratch code kept from experimenting with the episode/keyword regexes:
#
# test = [
#     '무한도전.E533 170610 720p NEXT',
#     '무한도전 E533 170610.720p NEXT',
#     '무한도전.E533.170610.720p NEXT.mp4',
#     '무한도전.E533.170610.720p NEXT',
#     '무한도전.E533.170610.720p NEXT',
#     '무한도전 E532 170603 720p NEXT',
# ]
#
# a = []
# # for k in test:
# #     a += re.findall(r'[ .,]E([0-9]+)[ .,]', k)
#
# # Note: as written, this pattern never matches the titles above (it needs
# # wildcards between the groups, e.g. r"(무한도전).*(720p).*(NEXT)"), so
# # m.groups() would raise AttributeError on the resulting None.
# for k in test:
#     m = re.search(r"(무한도전)(720p)(.NEXT)", k)
#     print(k + ': ')
#     res = m.groups()
#     print(len(res))
#     print(res)
#     print('')
#
# print(a)
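
# A working version of the scratch experiment above, for reference; the
# wildcard-separated pattern is an assumption about the intended match.
def regex_demo():
    titles = [
        '무한도전.E533.170610.720p NEXT',
        '무한도전 E532 170603 720p NEXT',
    ]
    for title in titles:
        m = re.search(r'(무한도전).*(720p).*(NEXT)', title)
        ep = re.search(r'[ .,]E([0-9]+)[ .,]', title)
        print(title, '->', m.groups() if m else None,
              'E' + ep.group(1) if ep else '?')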