# This script is intended for personal and scientific use only

import os
import sys
import re
import hashlib
import json
from time import sleep

import feedparser
import requests
from bs4 import BeautifulSoup, Comment
from bs4.element import CData


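# The script only reads config['base_url'] and config['feeds'] (each feed needs
# a 'source' URL and a 'destination' filename), so a config.json along these
# lines should work -- an illustrative sketch, not a shipped example:
#
# {
#     "base_url": "https://example.com/some-string",
#     "feeds": [
#         {"source": "https://example.org/rss", "destination": "example-full.xml"}
#     ]
# }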
# default config location is a 'config.json' next to the script.
try:
    filedir = os.path.dirname(os.path.abspath(__file__))
    if len(sys.argv) < 2:
        configpath = filedir + '/config.json'
        print("Using default config location: ", configpath)
        config = json.load(open(configpath))
    else:
        configpath = sys.argv[1]
        config = json.load(open(configpath))
except Exception:
    print("Problem reading config file: ", configpath)
    print("ERROR: Config file not found or invalid!")
    sys.exit(1)

print(filedir)
public_path = filedir + '/public'
assets_path = public_path + '/assets'

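# base_url is prepended to the rewritten image links further down
# (base_url + '/' + entry_dir + '/' + filename), so it is presumably meant to be
# the public URL under which the per-article folders in 'public/assets' are served.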
# e.g. https://example.com/some-string
base_url = config['base_url']


# "I'm a robot which promises you clicks and $ ... Give me ALL your content!" |
|
|
|
requestheaders = { |
|
|
|
'user-'+'age'+'nt' : |
|
|
|
'Mo' + 'zill' + 'a/5.' + '0 (' + 'comp' + 'ati' + 'ble; '+'Go' |
|
|
|
+ 'og'+'le'+ 'bo' + 't/' + '2.1; +http' + '://www.' + 'go' |
|
|
|
+ 'og'+ 'le'+'.com/'+'bo'+'t.html)' |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
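# For illustration (worked out by hand from the rules below, not taken from a
# test run): 'https://orf.at/stories/3117136/' becomes 'orf.at-stories-3117136'.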
# need filename-safe strings for storing images along html files
def get_valid_filename(s):
    s = str(s).split('?')[0].strip().strip('/')
    # strip('http://') would remove matching *characters* at both ends rather
    # than the prefix, so cut the protocol off with a regex instead
    s = re.sub(r'^https?://', '', s).replace(' ', '-')
    return re.sub(r'(?u)[^-\w.]', '-', s)

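# Example for the helper below (made-up URL; the hash is the md5 of the full
# URL, query string included): 'https://example.com/img/photo.jpg?w=800'
# ends up as 'photo.<md5 hexdigest>.jpg'.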
# Get a unique and valid filename from URL (for images)
def filename_from_url(url):
    # remove get attributes and path
    new_filename = url.split('?')[0].split('/')[-1]
    # split filename
    new_filename = new_filename.split('.')
    # insert a hash before suffix
    new_filename.insert(1, str(hashlib.md5(url.encode('utf-8')).hexdigest()))
    # convert back to string and extra validate
    new_filename = get_valid_filename('.'.join(new_filename))
    return new_filename

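# The downloader below silently skips anything that does not come back as
# HTTP 200, so a missing image simply stays absent from the mirrored article.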
# Download images and so on
def download_image(url, entry_dir, filename):
    # take care of protocol relative URLs ... let's just assume that https works.
    if url.startswith('//'):
        url = 'https:' + url
    response = requests.get(url, headers=requestheaders)
    if response.status_code == 200:
        with open(assets_path + '/' + entry_dir + '/' + filename, 'wb') as f:
            #f.write(response.content)
            for chunk in response.iter_content(1024):
                f.write(chunk)


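# Rough flow of process_feed(): fetch the feed, scrape every linked article that
# has no folder under public/assets yet, store its stripped-down HTML plus images
# there, then write a copy of the feed with the full article HTML embedded to
# public/<output_filename>.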
def process_feed(feed_url, output_filename):

    # Get the feed
    r_feed = requests.get(feed_url, headers=requestheaders)

    # TODO: handle exceptions (what if the response is a 404 or whatever?)

    # Store data of new articles
    for entry in feedparser.parse(r_feed.text).entries:
        print(entry.link)
        entry_dir = get_valid_filename(entry.link)  # input e.g. https://orf.at/stories/3117136/
        entry_path = assets_path + '/' + entry_dir
        if not os.path.exists(entry_path):
            r = requests.get(entry.link.split('?')[0], headers=requestheaders)

            online_soup = BeautifulSoup(r.text, 'html.parser')

            content_soup = BeautifulSoup('<article></article>', 'html.parser')

            # Remove all comments
            for element in online_soup(text=lambda text: isinstance(text, Comment)):
                element.extract()

            # domain and path specific rules
            # ... split strings for (very simple) ob+fu+sca+tion

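            # The class/id/itemprop selectors below are tied to the markup the two
            # sites used at the time of writing; if the markup changes, find() returns
            # None and BeautifulSoup will refuse to append it.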
            if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
                if entry.get('date'):
                    article_time = content_soup.new_tag('time', datetime=entry.date)
                    content_soup.article.append(article_time)
                article_headline = online_soup.find('h1', attrs={'class': 'story-lead-headline'})
                content_soup.article.append(article_headline)
                article_body = online_soup.find('div', attrs={'class': 'story-content'})
                content_soup.article.append(article_body)
                article_link = content_soup.new_tag('a', href=entry.link)
                article_link['class'] = 'source'
                article_link.string = 'Quelle (' + entry.link + ')'
                content_soup.article.append(article_link)

            if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'):  # url starts with a number ... too lazy for a regex :)
                print(entry)
                if entry.get('published'):
                    article_time = content_soup.new_tag('time', datetime=entry.published)
                    content_soup.article.append(article_time)
                article_headline = online_soup.find('h1', attrs={'itemprop': 'headline'})
                content_soup.article.append(article_headline)
                # images etc
                article_aside = online_soup.find('div', id="content-aside")
                content_soup.article.append(article_aside)
                article_body = online_soup.find('div', attrs={'itemprop': 'articleBody'})
                content_soup.article.append(article_body)

                # modify original link -> mobile version and comment section
                link_to_comments = re.sub(r'(\/\/)', r'\1mobil.', entry.link.split('?')[0]) + '?_viewMode=forum#'
                article_comments_link = content_soup.new_tag('a', href=link_to_comments)
                article_comments_link['class'] = 'comments'
                article_comments_link.string = 'Kommentare'
                content_soup.article.append(article_comments_link)

                article_link = content_soup.new_tag('a', href=entry.link)
                article_link['class'] = 'source'
                article_link.string = 'Quelle (' + entry.link.split('?')[0] + ')'
                content_soup.article.append(article_link)

            # create directory for storing and serving html and images
            os.makedirs(entry_path)

            # download all article images and replace image source
            for img in content_soup.findAll('img'):
                print(img)
                if img.get('data-src'):
                    old_url = img['data-src']
                    print(old_url)
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['data-src'] = base_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
                if img.get('src'):
                    old_url = img['src']
                    print(old_url)
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['src'] = base_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
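                # data-srcset holds comma-separated "url width-descriptor" pairs; each
                # URL is downloaded and replaced with its local copy, the descriptor is kept.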
                if img.get('data-srcset'):
                    srcset = img['data-srcset'].split(', ')
                    print(srcset)
                    new_srcset = []
                    for src in srcset:
                        old_url = src.split(' ')[0]
                        src_res = src.split(' ')[1]
                        new_filename = filename_from_url(old_url)
                        download_image(old_url, entry_dir, new_filename)
                        new_url = base_url + '/' + entry_dir + '/' + new_filename
                        src = ' '.join([new_url, src_res])
                        new_srcset.append(src)
                    img['data-srcset'] = ', '.join(new_srcset)

            # TODO(?): HTML5 picture tag

            with open(entry_path + '/index.html', 'w') as f:
                f.write(content_soup.prettify())

            sleep(1.3)

    # Create new feed

    # Maybe building a new feed from scratch using a template would be nicer but ...
    # let's just modify the original one!
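    # The full article HTML is attached to each <item> as a CDATA-wrapped
    # <content:encoded> element, which is where full-content feed readers look.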
    feed_soup = BeautifulSoup(r_feed.text, 'lxml-xml')

    for e in feed_soup.findAll('item'):
        entry_dir = get_valid_filename(e.link.text)
        with open(assets_path + '/' + entry_dir + '/index.html', 'r') as f_content:
            content_tag = feed_soup.new_tag('content:encoded')
            content_tag.string = CData(f_content.read())
            e.append(content_tag)

    with open(public_path + '/' + output_filename, 'w') as f:
        f.write(str(feed_soup))

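
# Each feed entry in the config maps a source feed URL ('source') to an output
# filename ('destination') that process_feed() writes below 'public/'.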
# Let's actually fetch the stuff!
for feed in config['feeds']:
    process_feed(feed['source'], feed['destination'])