commit 7e0a95212c137cae66a2eda99d8845453276e36d
Author: Andreas Demmelbauer
Date:   Tue Apr 2 12:39:07 2019 -0700

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d344ba6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+config.json
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..76570b8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,61 @@
+# Feedcake
+## "Give me a piece of cake and I want the whole cake."
+
+### The Problem
+Most news platforms don't give you the full article via RSS/Atom.
+This wouldn't be a big problem, but some of them do crazy 1984-ish stuff on their
+websites or have built up paywalls for users with privacy add-ons.
+
+### Goal of this script
+Get a full-featured news feed (full articles with images) from various
+news pages.
+
+### Benefits for the user
+* No need to visit the website to read the articles
+* No ads
+* No tracking
+
+### Possible downsides for the user
+* Articles don't get updated once they are scraped
+* Articles arrive with some delay
+* Interactive/special elements in articles may not work
+
+### What it does
+* Fetches the news feed from the original website
+* Scrapes the contents of new entries and saves them into a directory structure
+* Saves a full-featured RSS file
+
+### ... and what it doesn't
+* Manage when it scrapes (use crontab or something else for that)
+* Serve the feeds and assets via HTTPS (use your favorite web server for that)
+* Deal with article comments
+* Archive feeds (content and assets are kept, but without metadata)
+* Use any sort of database (the file structure is everything)
+* Clean up old assets
+* Automatically update the base directory if it changed
+
+### Ugly stuff?
+* The HTML files (feed content) get stored alongside the assets, even if they
+  don't need to be exposed via HTTPS.
+
+### How to use
+* git clone this project and enter the directory
+* Install python3, pip and virtualenv
+* Create a virtualenv: `virtualenv -p python3 ~/.virtualenvs/feedcake`
+* Activate your new virtualenv: `source ~/.virtualenvs/feedcake/bin/activate`
+* Switch into the project's directory: `cd feedcake`
+* Install the requirements: `pip3 install -r requirements.txt`
+* Copy the config example: `cp config.example.json config.json`
+* Edit `config.json`
+* Copy the cron example: `cp cron-example.sh cron.sh`
+* Edit `cron.sh`
+* Add a cronjob for `cron.sh`: `crontab -e`
+  * `*/5 * * * * /absolute/path/to/cron.sh > /path/to/logfile 2>&1`
+* Set up your web server: the `base_url` must point to the `public` directory.
+  You should add basic HTTP authentication or at least keep the URL private.
+* After running the script the first time, your desired feed is available at
+  `base_url/destination` (e.g. `https://yourdomain.tld/some-url/newspaper.xml`)
+
+### TODOs
+* Decide what should happen with old news articles and assets which are no
+  longer listed in the current feed.
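As a quick check of the last usage step above, here is a minimal sketch (not part of this commit) that parses the generated file locally with feedparser and reports whether the full article text was embedded. It assumes the default `public/` layout and the `newspaper.xml` destination from the example config below; adjust the path to your own `destination`.

# check_feed.py -- hypothetical helper, run from the project directory
import feedparser

parsed = feedparser.parse('public/newspaper.xml')  # filename assumed from the example config
for entry in parsed.entries:
    # feedparser exposes the injected <content:encoded> element as entry.content
    has_full_text = bool(getattr(entry, 'content', None))
    print(entry.link, '->', 'full article embedded' if has_full_text else 'no embedded content')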
diff --git a/config.example.json b/config.example.json
new file mode 100644
index 0000000..e3a42c1
--- /dev/null
+++ b/config.example.json
@@ -0,0 +1,13 @@
+{
+  "base_url" : "https://yourdomain.tld/some-url",
+  "feeds" : [
+    {
+      "source" : "https://a.newspaper.tld/news.xml",
+      "destination": "newspaper.xml"
+    },
+    {
+      "source" : "https://another.newspaper.tld/rss",
+      "destination": "another-newspaper.xml"
+    }
+  ]
+}
diff --git a/feedcake.py b/feedcake.py
new file mode 100644
index 0000000..6a18945
--- /dev/null
+++ b/feedcake.py
@@ -0,0 +1,217 @@
+# This script is intended for personal and scientific use only
+
+import os
+import sys
+import re
+import hashlib
+import json
+from time import sleep
+import feedparser
+import requests
+from bs4 import BeautifulSoup, Comment
+from bs4.element import CData
+
+
+# The default config location is a 'config.json' next to the script.
+try:
+    filedir = os.path.dirname(os.path.abspath(__file__))
+    if len(sys.argv) < 2:
+        configpath = filedir + '/config.json'
+        print("Using default config location: ", configpath)
+        config = json.load(open(configpath))
+    else:
+        configpath = sys.argv[1]
+        config = json.load(open(configpath))
+
+except:
+    print("Problem reading config file: ", configpath)
+    print("ERROR: Config file not found or invalid!")
+    sys.exit(1)
+
+print(filedir)
+public_path = filedir + '/public'
+assets_path = public_path + '/assets'
+
+# e.g. https://example.com/some-string
+base_url = config['base_url']
+
+
+# "I'm a robot which promises you clicks and $ ... Give me ALL your content!"
+requestheaders = {
+    'user-'+'age'+'nt':
+        'Mo' + 'zill' + 'a/5.' + '0 (' + 'comp' + 'ati' + 'ble; '+'Go' +
+        'og'+'le'+'bo' + 't/' + '2.1; +http' + '://www.' + 'go' +
+        'og'+'le'+'.com/'+'bo'+'t.html)'
+}
+
+
+# We need filename-safe strings for storing images along with the HTML files.
+def get_valid_filename(s):
+    # drop the query string, the scheme and surrounding slashes, then keep only safe characters
+    s = re.sub(r'^https?://', '', str(s).split('?')[0].strip().strip('/')).replace(' ', '-')
+    return re.sub(r'(?u)[^-\w.]', '-', s)
+
+# Get a unique and valid filename from a URL (for images)
+def filename_from_url(url):
+    # remove GET attributes and the path
+    new_filename = url.split('?')[0].split('/')[-1]
+    # split the filename
+    new_filename = new_filename.split('.')
+    # insert a hash before the suffix
+    new_filename.insert(1, str(hashlib.md5(url.encode('utf-8')).hexdigest()))
+    # convert back to a string and validate once more
+    new_filename = get_valid_filename('.'.join(new_filename))
+    return new_filename
+
+# Download images and other assets
+def download_image(url, entry_dir, filename):
+    # take care of protocol-relative URLs ... let's just assume that https works.
+    if url.startswith('//'):
+        url = 'https:' + url
+    response = requests.get(url, headers=requestheaders)
+    if response.status_code == 200:
+        with open(assets_path + '/' + entry_dir + '/' + filename, 'wb') as f:
+            for chunk in response.iter_content(1024):
+                f.write(chunk)
+
+def process_feed(feed_url, output_filename):
+
+    # Get the feed
+    r_feed = requests.get(feed_url, headers=requestheaders)
+
+    # TODO: exceptions (what if 404 or whatever?)
+
+    # Store the data of new articles
+    for entry in feedparser.parse(r_feed.text).entries:
+        print(entry.link)
+        entry_dir = get_valid_filename(entry.link)  # input e.g. https://orf.at/stories/3117136/
+        entry_path = assets_path + '/' + entry_dir
+        if not os.path.exists(entry_path):
+            r = requests.get(entry.link.split('?')[0], headers=requestheaders)
+
+            online_soup = BeautifulSoup(r.text, 'html.parser')
+
+            content_soup = BeautifulSoup('<article></article>', 'html.parser')
+
+            # Remove all comments
+            for element in online_soup(text=lambda text: isinstance(text, Comment)):
+                element.extract()
+
+            # domain- and path-specific rules
+            # ... split strings for (very simple) ob+fu+sca+tion
+
+            if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
+                if entry.date:
+                    article_time = content_soup.new_tag('time', datetime=entry.date)
+                    content_soup.article.append(article_time)
+                article_headline = online_soup.find('h1', attrs={'class': 'story-lead-headline'})
+                content_soup.article.append(article_headline)
+                article_body = online_soup.find('div', attrs={'class': 'story-content'})
+                content_soup.article.append(article_body)
+                article_link = content_soup.new_tag('a', href=entry.link)
+                article_link['class'] = 'source'
+                article_link.string = 'Quelle (' + entry.link + ')'
+                content_soup.article.append(article_link)
+
+            if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'):  # URL starts with a number ... too lazy for a regex :)
+                print(entry)
+                if entry.published:
+                    article_time = content_soup.new_tag('time', datetime=entry.published)
+                    content_soup.article.append(article_time)
+                article_headline = online_soup.find('h1', attrs={'itemprop': 'headline'})
+                content_soup.article.append(article_headline)
+                # images etc.
+                article_aside = online_soup.find('div', id="content-aside")
+                content_soup.article.append(article_aside)
+                article_body = online_soup.find('div', attrs={'itemprop': 'articleBody'})
+                content_soup.article.append(article_body)
+                article_link = content_soup.new_tag('a', href=entry.link)
+                article_link['class'] = 'source'
+                article_link.string = 'Quelle (' + entry.link.split('?')[0] + ')'
+                content_soup.article.append(article_link)
+
+                # modify the original link -> mobile version with comment section
+                link_to_comments = re.sub(r'(\/\/)', r'\1mobil.', entry.link.split('?')[0]) + '?_viewMode=forum#'
+                article_comments_link = content_soup.new_tag('a', href=link_to_comments)
+                article_comments_link['class'] = 'comments'
+                article_comments_link.string = 'Kommentare'
+                content_soup.article.append(article_comments_link)
+
+            # create a directory for storing and serving the HTML and images
+            os.makedirs(entry_path)
+
+            # download all article images and replace the image sources
+            for img in content_soup.findAll('img'):
+                print(img)
+                if img.get('data-src'):
+                    old_url = img['data-src']
+                    print(old_url)
+                    if not old_url.startswith('data:'):
+                        new_filename = filename_from_url(old_url)
+                        img['data-src'] = base_url + '/' + entry_dir + '/' + new_filename
+                        download_image(old_url, entry_dir, new_filename)
+                if img.get('src'):
+                    old_url = img['src']
+                    print(old_url)
+                    if not old_url.startswith('data:'):
+                        new_filename = filename_from_url(old_url)
+                        img['src'] = base_url + '/' + entry_dir + '/' + new_filename
+                        download_image(old_url, entry_dir, new_filename)
+                if img.get('data-srcset'):
+                    srcset = img['data-srcset'].split(', ')
+                    new_srcset = []
+                    for src in srcset:
+                        old_url = src.split(' ')[0]
+                        src_res = src.split(' ')[1]
+                        new_filename = filename_from_url(old_url)
+                        download_image(old_url, entry_dir, new_filename)
+                        new_url = base_url + '/' + entry_dir + '/' + new_filename
+                        src = ' '.join([new_url, src_res])
+                        new_srcset.append(src)
+                    img['data-srcset'] = ', '.join(new_srcset)
+
+            # TODO(?): HTML5 picture tag
+
+            f = open(entry_path + '/index.html', 'w')
+            f.write(str(content_soup.prettify()))
+            f.close()
+
+            sleep(1.3)
+
+    # Create the new feed
+
+    # Maybe building a new feed from scratch using a template would be nicer, but ...
+    # let's just modify the original one!
+
+    feed_soup = BeautifulSoup(r_feed.text, 'lxml-xml')
+
+    for e in feed_soup.findAll('item'):
+        entry_dir = get_valid_filename(e.link.text)
+        f_content = open(assets_path + '/' + entry_dir + '/index.html', 'r')
+        content_tag = feed_soup.new_tag('content:encoded')
+        content_tag.string = CData(f_content.read())
+        e.append(content_tag)
+        f_content.close()
+
+    f = open(public_path + '/' + output_filename, 'w')
+    f.write(str(feed_soup))
+    f.close()
+
+
+# Let's actually fetch the stuff!
+
+for feed in config['feeds']:
+    process_feed(feed['source'], feed['destination'])
diff --git a/public/.gitignore b/public/.gitignore
new file mode 100644
index 0000000..d6b7ef3
--- /dev/null
+++ b/public/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3567009
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+beautifulsoup4==4.7.1
+feedparser==5.2.1
+requests==2.20.0
+lxml==4.3.3
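For reference, a short sketch of how the helpers in feedcake.py map URLs onto the directory layout under public/assets. This is illustration only: the URLs are made up, and the two functions mirror get_valid_filename and filename_from_url above rather than being imported from the script.

# layout_example.py -- hypothetical illustration of the asset path scheme
import hashlib
import re

def get_valid_filename(s):
    # same idea as in feedcake.py: drop the query string, scheme and slashes,
    # then replace anything that is not a word character, dash or dot
    s = re.sub(r'^https?://', '', str(s).split('?')[0].strip().strip('/')).replace(' ', '-')
    return re.sub(r'(?u)[^-\w.]', '-', s)

def filename_from_url(url):
    # same idea as in feedcake.py: keep the original name, insert an MD5 of the full URL
    parts = url.split('?')[0].split('/')[-1].split('.')
    parts.insert(1, hashlib.md5(url.encode('utf-8')).hexdigest())
    return get_valid_filename('.'.join(parts))

article_url = 'https://a.newspaper.tld/stories/1234567/'  # made-up article link
image_url = 'https://a.newspaper.tld/img/teaser.jpg'      # made-up image URL

print(get_valid_filename(article_url))  # a.newspaper.tld-stories-1234567  (entry directory)
print(filename_from_url(image_url))     # teaser.<md5 of the URL>.jpg  (saved next to index.html)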