# This script is intended for personal and scientific use only
import os
import sys
import re
import hashlib
import json
import time

import feedparser
import requests
from bs4 import BeautifulSoup, Comment
from bs4.element import CData


def timestamp():
    return '[' + time.strftime("%d/%b/%Y:%H:%M:%S %z", time.localtime()) + ']'


# Default config location is a 'config.json' next to the script;
# alternatively, a config path can be passed as the first argument.
try:
    filedir = os.path.dirname(os.path.abspath(__file__))
    if len(sys.argv) < 2:
        configpath = filedir + '/config.json'
        print(timestamp(), "Using default config location: ", configpath)
    else:
        configpath = sys.argv[1]
    config = json.load(open(configpath))
except Exception:
    print(timestamp(), "Problem reading config file: ", configpath)
    print(timestamp(), "ERROR: Config file not found or invalid!")
    sys.exit(1)

# Feeds and assets are stored in (and served from) directories next to the script
public_path = filedir + '/public'
assets_path = public_path + '/assets'
feeds_path = public_path + '/feeds'

# Public base URL under which the assets directory is served,
# e.g. https://example.com/some-string
assets_url = config['assets_url']

requestheaders = config['request_headers']
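
# An illustrative config.json, assuming nothing beyond the keys this script
# actually reads (config['assets_url'], config['request_headers'], and the
# per-feed 'source' / 'destination' / 'exclude' fields used in process_feed()
# below); all values are placeholders:
#
# {
#     "assets_url": "https://example.com/some-string",
#     "request_headers": {"User-Agent": "Mozilla/5.0 (personal full-text feed mirror)"},
#     "feeds": [
#         {
#             "source": "https://example.com/rss",
#             "destination": "example.xml",
#             "exclude": ["some keyword"]
#         }
#     ]
# }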

# We need filename-safe strings for storing images next to the html files
def get_valid_filename(s):
    # note: str.strip() removes a *set* of characters, not a prefix,
    # so the scheme is stripped with a regex instead
    s = str(s).split('?')[0].strip().strip('/')
    s = re.sub(r'^https?://', '', s).replace(' ', '-')
    return re.sub(r'(?u)[^-\w.]', '-', s)


# Get a unique and valid filename from a URL (for images)
def filename_from_url(url):
    # remove GET parameters and path
    new_filename = url.split('?')[0].split('/')[-1]
    # split the filename
    new_filename = new_filename.split('.')
    # insert a hash of the full URL before the suffix
    new_filename.insert(1, str(hashlib.md5(url.encode('utf-8')).hexdigest()))
    # convert back to string and validate once more
    new_filename = get_valid_filename('.'.join(new_filename))
    return new_filename


# Download images and other assets
def download_image(url, entry_dir, filename):
    # take care of protocol-relative URLs ... let's just assume that https works.
    if url.startswith('//'):
        url = 'https:' + url
    response = requests.get(url, headers=requestheaders)
    if response.status_code == 200:
        with open(assets_path + '/' + entry_dir + '/' + filename, 'wb') as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)


# Process the feed entries specified in the config
def process_feed(obj):
    feed_url = obj['source']
    output_filename = obj['destination']

    print(timestamp(), 'Updating:', obj['destination'])

    # Step 1: Get the feed
    r_feed = requests.get(feed_url, headers=requestheaders)
    # TODO: handle HTTP errors (404 etc.)

    # Step 2: Scrape and store data of new articles
    for entry in feedparser.parse(r_feed.text).entries:
        entry_dir = get_valid_filename(entry.link)  # input e.g. https://orf.at/stories/3117136/
        entry_path = assets_path + '/' + entry_dir
        if not os.path.exists(entry_path):
            print(timestamp(), 'New item:', entry.link)
            r = requests.get(entry.link.split('?')[0], headers=requestheaders)

            online_soup = BeautifulSoup(r.text, 'html.parser')

            # container for the extracted article content
            content_soup = BeautifulSoup('<div></div>', 'html.parser')

            # Remove all HTML comments
            for element in online_soup(text=lambda text: isinstance(text, Comment)):
                element.extract()

            # domain- and path-specific rules
            # ... ob+fu+sca+tion for seo
            if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
                if entry.date:
                    article_time = content_soup.new_tag('time', datetime=entry.date)
                    content_soup.div.append(article_time)

                article_headline = online_soup.find('h1', attrs={'class': 'story-lead-headline'})
                content_soup.div.append(article_headline)

                article_body = online_soup.find('div', attrs={'class': 'story-content'})
                content_soup.div.append(article_body)

                # Add a link to the original article
                article_source = content_soup.new_tag('a', href=entry.link)
                article_source['class'] = 'source'
                article_source.string = 'Quelle: ' + entry.link
                content_soup.div.append(article_source)

            if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'):  # url starts with a number ... too lazy for a regex :)
                if entry.published:
                    article_time = content_soup.new_tag('time', datetime=entry.published)
                    content_soup.div.append(article_time)

                article_headline = online_soup.find('h1', attrs={'itemprop': 'headline'})
                content_soup.div.append(article_headline)

                # images etc.
                article_aside = online_soup.find('div', id="content-aside")
                content_soup.div.append(article_aside)

                article_body = online_soup.find('div', attrs={'itemprop': 'articleBody'})
                content_soup.div.append(article_body)

                # Add a link to the comment section:
                # modify the original link -> mobile version plus comment anchor
                link_to_comments = re.sub(r'(\/\/)', r'\1mobil.', entry.link.split('?')[0]) + '?_viewMode=forum#'
                article_comments_link = content_soup.new_tag('a', href=link_to_comments)
                article_comments_link['class'] = 'comments'
                article_comments_p = content_soup.new_tag('p')
                article_comments_link.string = 'Kommentare'
                article_comments_p.append(article_comments_link)
                content_soup.div.append(article_comments_p)

                # Add a link to the original article
                article_source = content_soup.new_tag('a', href=entry.link.split('?')[0])
                article_source['class'] = 'source'
                article_source.string = 'Quelle: ' + entry.link.split('?')[0]
                article_source_p = content_soup.new_tag('p')
                article_source_p.append(article_source)
                content_soup.div.append(article_source_p)

            # create a directory for storing and serving html and images
            os.makedirs(entry_path)

            # download all article images and rewrite their sources
            for img in content_soup.findAll('img'):
                if img.get('data-src'):
                    old_url = img['data-src']
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['data-src'] = assets_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
                if img.get('src'):
                    old_url = img['src']
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['src'] = assets_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
                if img.get('data-srcset'):
                    srcset = img['data-srcset'].split(', ')
                    new_srcset = []
                    for src in srcset:
                        # each entry is 'URL [descriptor]', e.g. '.../img.jpg 640w'
                        src_parts = src.split(' ')
                        old_url = src_parts[0]
                        src_res = src_parts[1] if len(src_parts) > 1 else ''
                        new_filename = filename_from_url(old_url)
                        download_image(old_url, entry_dir, new_filename)
                        new_url = assets_url + '/' + entry_dir + '/' + new_filename
                        src = ' '.join([new_url, src_res]).strip()
                        new_srcset.append(src)
                    img['data-srcset'] = ', '.join(new_srcset)
                # TODO(?): HTML5 picture tag

            # Save the HTML
            f = open(entry_path + '/index.html', 'w', encoding='utf-8')
            f.write(str(content_soup))
            f.close()

            # Wait a bit between articles
            time.sleep(1.3)
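
    # Step 3 below injects the stored article HTML into each feed item as a
    # CDATA-wrapped <content:encoded> element. Roughly (illustrative shape only;
    # the actual markup comes from the scraped pages):
    #
    #   <item>
    #     <title>...</title>
    #     <link>https://orf.at/stories/3117136/</link>
    #     <content:encoded><![CDATA[<div><time ...></time><h1>...</h1>...</div>]]></content:encoded>
    #   </item>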

    # Step 3: Create the 'new' feed.
    # Maybe building a new feed from scratch using a template would be nicer,
    # but let's just modify the original one!
    feed_soup = BeautifulSoup(r_feed.text, 'lxml-xml')

    # Exclude items whose title matches one of the configured keywords
    if obj.get('exclude') and isinstance(obj['exclude'], list):
        for e in feed_soup.findAll('item'):
            matches = [x for x in obj['exclude'] if x.lower() in e.title.text.lower()]
            if len(matches) > 0:
                e.extract()
                print(timestamp(), 'Exclude: ', e.title.text, '->', matches)

    # Add the stored full-text content to each remaining item
    for e in feed_soup.findAll('item'):
        entry_dir = get_valid_filename(e.link.text)
        f_content = open(assets_path + '/' + entry_dir + '/index.html', 'r', encoding='utf-8')
        content_tag = feed_soup.new_tag('content:encoded')
        content_tag.string = CData(f_content.read())
        e.append(content_tag)
        f_content.close()

    # create the output directory if it is not present yet
    os.makedirs(feeds_path, exist_ok=True)

    f = open(feeds_path + '/' + output_filename, 'w', encoding='utf-8')
    f.write(str(feed_soup))
    f.close()
    print(timestamp(), 'Done!')


# Let's actually fetch the stuff!
for feed in config['feeds']:
    process_feed(feed)
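
# Typical usage (an assumption about deployment, not something this script enforces):
# run it periodically, e.g. via cron, and serve the 'public' directory with any static
# web server so that the rewritten feeds in public/feeds and the mirrored images in
# public/assets (reachable under assets_url) are available to a feed reader.
# 'this_script.py' below is a placeholder for whatever this file is named:
#
#   python3 /path/to/this_script.py /path/to/config.json
#   # or, with a config.json placed next to the script:
#   python3 /path/to/this_script.py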