# This script is intended for personal and scientific use only
import os
import sys
import re
import hashlib
import json
from time import sleep

import feedparser
import requests
from bs4 import BeautifulSoup, Comment
from bs4.element import CData

# default config location is a 'config.json' next to the script.
filedir = os.path.dirname(os.path.abspath(__file__))
if len(sys.argv) < 2:
    configpath = filedir + '/config.json'
    print("Using default config location: ", configpath)
else:
    configpath = sys.argv[1]

try:
    with open(configpath) as config_file:
        config = json.load(config_file)
except (OSError, json.JSONDecodeError):
    print("Problem reading config file: ", configpath)
    print("ERROR: Config file not found or invalid!")
    sys.exit(1)

public_path = filedir + '/public'
assets_path = public_path + '/assets'
# make sure the output directories exist before anything is written into them
os.makedirs(assets_path, exist_ok=True)

# e.g. https://example.com/some-string
base_url = config['base_url']

# "I'm a robot which promises you clicks and $ ... Give me ALL your content!"
requestheaders = {
    'user-'+'age'+'nt': 'Mo' + 'zill' + 'a/5.' + '0 (' + 'comp' + 'ati' + 'ble; ' + 'Go' + 'og'+'le' + 'bo' + 't/' + '2.1; +http' + '://www.' + 'go' + 'og' + 'le' + '.com/' + 'bo' + 't.html)'
}


# need filename-safe strings for storing images along with the html files
def get_valid_filename(s):
    s = str(s).split('?')[0].strip().strip('/')
    # drop the scheme with a regex; str.strip('http://') would strip characters, not a prefix
    s = re.sub(r'^https?://', '', s).replace(' ', '-')
    return re.sub(r'(?u)[^-\w.]', '-', s)


# Get a unique and valid filename from a URL (for images)
def filename_from_url(url):
    # remove GET attributes and path
    new_filename = url.split('?')[0].split('/')[-1]
    # split the filename
    new_filename = new_filename.split('.')
    # insert a hash before the suffix
    new_filename.insert(1, str(hashlib.md5(url.encode('utf-8')).hexdigest()))
    # convert back to string and validate once more
    return get_valid_filename('.'.join(new_filename))


# Download images and so on
def download_image(url, entry_dir, filename):
    # take care of protocol-relative URLs ... let's just assume that https works.
    if url.startswith('//'):
        url = 'https:' + url
    response = requests.get(url, headers=requestheaders)
    if response.status_code == 200:
        with open(assets_path + '/' + entry_dir + '/' + filename, 'wb') as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)


def process_feed(feed_url, output_filename):
    # Get the feed
    r_feed = requests.get(feed_url, headers=requestheaders)
    # TODO: exceptions (what if 404 or whatever?)

    # Store data of new articles
    for entry in feedparser.parse(r_feed.text).entries:
        print(entry.link)
        entry_dir = get_valid_filename(entry.link)  # input e.g. https://orf.at/stories/3117136/
        entry_path = assets_path + '/' + entry_dir
        if not os.path.exists(entry_path):
            r = requests.get(entry.link.split('?')[0], headers=requestheaders)
            online_soup = BeautifulSoup(r.text, 'html.parser')
            # empty <article> wrapper that the extracted pieces are appended to
            content_soup = BeautifulSoup('<article></article>', 'html.parser')
            # Remove all comments
            for element in online_soup(text=lambda text: isinstance(text, Comment)):
                element.extract()

            # domain- and path-specific rules
            # ... split strings for (very simple) ob+fu+sca+tion
            if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
                if entry.get('updated'):
                    article_time = content_soup.new_tag('time', datetime=entry.updated)
                    content_soup.article.append(article_time)
                article_headline = online_soup.find('h1', attrs={'class': 'story-lead-headline'})
                content_soup.article.append(article_headline)
                article_body = online_soup.find('div', attrs={'class': 'story-content'})
                content_soup.article.append(article_body)
                article_link = content_soup.new_tag('a', href=entry.link)
                article_link['class'] = 'source'
                article_link.string = 'Quelle (' + entry.link + ')'
                content_soup.article.append(article_link)

            # url path starts with a number ... too lazy for a regex :)
            if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'):
                if entry.get('published'):
                    article_time = content_soup.new_tag('time', datetime=entry.published)
                    content_soup.article.append(article_time)
                article_headline = online_soup.find('h1', attrs={'itemprop': 'headline'})
                content_soup.article.append(article_headline)
                # images etc.
                article_aside = online_soup.find('div', id='content-aside')
                content_soup.article.append(article_aside)
                article_body = online_soup.find('div', attrs={'itemprop': 'articleBody'})
                content_soup.article.append(article_body)
                article_link = content_soup.new_tag('a', href=entry.link)
                article_link['class'] = 'source'
                article_link.string = 'Quelle (' + entry.link.split('?')[0] + ')'
                content_soup.article.append(article_link)
                # modify the original link -> mobile version and comment section
                link_to_comments = re.sub(r'(\/\/)', r'\1mobil.', entry.link.split('?')[0]) + '?_viewMode=forum#'
                article_comments_link = content_soup.new_tag('a', href=link_to_comments)
                article_comments_link['class'] = 'comments'
                article_comments_link.string = 'Kommentare'
                content_soup.article.append(article_comments_link)

            # create directory for storing and serving html and images
            os.makedirs(entry_path)

            # download all article images and replace the image sources
            for img in content_soup.find_all('img'):
                print(img)
                if img.get('data-src'):
                    old_url = img['data-src']
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['data-src'] = base_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
                if img.get('src'):
                    old_url = img['src']
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['src'] = base_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
                if img.get('data-srcset'):
                    srcset = img['data-srcset'].split(', ')
                    new_srcset = []
                    for src in srcset:
                        # each srcset entry is "<url> <descriptor>"; the descriptor may be missing
                        parts = src.split(' ')
                        old_url = parts[0]
                        src_res = parts[1] if len(parts) > 1 else ''
                        new_filename = filename_from_url(old_url)
                        download_image(old_url, entry_dir, new_filename)
                        new_url = base_url + '/' + entry_dir + '/' + new_filename
                        new_srcset.append(' '.join([new_url, src_res]).strip())
                    img['data-srcset'] = ', '.join(new_srcset)
                # TODO(?): HTML5 picture tag

            with open(entry_path + '/index.html', 'w') as f:
                f.write(content_soup.prettify())
            sleep(1.3)

    # Create the new feed.
    # Maybe building a new feed from scratch using a template would be nicer, but ...
    # let's just modify the original one!
    feed_soup = BeautifulSoup(r_feed.text, 'lxml-xml')
    for e in feed_soup.find_all('item'):
        entry_dir = get_valid_filename(e.link.text)
        with open(assets_path + '/' + entry_dir + '/index.html', 'r') as f_content:
            content_tag = feed_soup.new_tag('content:encoded')
            content_tag.string = CData(f_content.read())
            e.append(content_tag)

    with open(public_path + '/' + output_filename, 'w') as f:
        f.write(str(feed_soup))


# Let's actually fetch the stuff!
for feed in config['feeds']:
    process_feed(feed['source'], feed['destination'])
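
# A minimal sketch of the expected config.json, derived from the keys this script
# reads ('base_url', 'feeds', 'source', 'destination'); the values are made-up
# examples. Judging by how the image URLs are built, 'base_url' should point to
# wherever the public/assets directory ends up being served. The script is run as
# `python <script>.py [path/to/config.json]` and falls back to the config.json
# next to the script when no path is given.
#
# {
#     "base_url": "https://example.com/some-string",
#     "feeds": [
#         {
#             "source": "https://example.org/rss/feed.xml",
#             "destination": "full-feed.xml"
#         }
#     ]
# }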