# This script is intended for personal and scientific use only

import os
import sys
import re
import hashlib
import json
from time import sleep

import feedparser
import requests
from bs4 import BeautifulSoup, Comment
from bs4.element import CData


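# The script only reads config['base_url'] and config['feeds'] (each feed needs
# a 'source' URL and a 'destination' filename), so a config.json along these
# lines should work -- an illustrative sketch, not a shipped example:
#
# {
#     "base_url": "https://example.com/some-string",
#     "feeds": [
#         {"source": "https://example.org/rss", "destination": "example-full.xml"}
#     ]
# }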
# default config location is a 'config.json' next to the script.
try:
    filedir = os.path.dirname(os.path.abspath(__file__))
    if len(sys.argv) < 2:
        configpath = filedir + '/config.json'
        print("Using default config location: ", configpath)
        config = json.load(open(configpath))
    else:
        configpath = sys.argv[1]
        config = json.load(open(configpath))
except Exception:
    print("Problem reading config file: ", configpath)
    print("ERROR: Config file not found or invalid!")
    sys.exit(1)

print(filedir)
public_path = filedir + '/public'
assets_path = public_path + '/assets'

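# base_url is prepended to the rewritten image links further down
# (base_url + '/' + entry_dir + '/' + filename), so it is presumably meant to be
# the public URL under which the per-article folders in 'public/assets' are served.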
# e.g. https://example.com/some-string
base_url = config['base_url']


# "I'm a robot which promises you clicks and $ ... Give me ALL your content!" |
|
|
|
requestheaders = { |
|
|
|
'user-'+'age'+'nt' : |
|
|
|
'Mo' + 'zill' + 'a/5.' + '0 (' + 'comp' + 'ati' + 'ble; '+'Go' |
|
|
|
+ 'og'+'le'+ 'bo' + 't/' + '2.1; +http' + '://www.' + 'go' |
|
|
|
+ 'og'+ 'le'+'.com/'+'bo'+'t.html)' |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
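# For illustration (worked out by hand from the rules below, not taken from a
# test run): 'https://orf.at/stories/3117136/' becomes 'orf.at-stories-3117136'.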
# need filename-safe strings for storing images along html files
def get_valid_filename(s):
    s = str(s).split('?')[0].strip().strip('/')
    # strip('http://') would remove matching *characters* at both ends rather
    # than the prefix, so cut the protocol off with a regex instead
    s = re.sub(r'^https?://', '', s).replace(' ', '-')
    return re.sub(r'(?u)[^-\w.]', '-', s)

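# Example for the helper below (made-up URL; the hash is the md5 of the full
# URL, query string included): 'https://example.com/img/photo.jpg?w=800'
# ends up as 'photo.<md5 hexdigest>.jpg'.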
# Get a unique and valid filename from URL (for images)
def filename_from_url(url):
    # remove get attributes and path
    new_filename = url.split('?')[0].split('/')[-1]
    # split filename
    new_filename = new_filename.split('.')
    # insert a hash before suffix
    new_filename.insert(1, str(hashlib.md5(url.encode('utf-8')).hexdigest()))
    # convert back to string and extra validate
    new_filename = get_valid_filename('.'.join(new_filename))
    return new_filename

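# The downloader below silently skips anything that does not come back as
# HTTP 200, so a missing image simply stays absent from the mirrored article.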
# Download images and so on
def download_image(url, entry_dir, filename):
    # take care of protocol relative URLs ... let's just assume that https works.
    if url.startswith('//'):
        url = 'https:' + url
    response = requests.get(url, headers=requestheaders)
    if response.status_code == 200:
        with open(assets_path + '/' + entry_dir + '/' + filename, 'wb') as f:
            #f.write(response.content)
            for chunk in response.iter_content(1024):
                f.write(chunk)


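# Rough flow of process_feed(): fetch the feed, scrape every linked article that
# has no folder under public/assets yet, store its stripped-down HTML plus images
# there, then write a copy of the feed with the full article HTML embedded to
# public/<output_filename>.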
def process_feed(feed_url, output_filename):

    # Get the feed
    r_feed = requests.get(feed_url, headers=requestheaders)

    # TODO: handle exceptions (what if the response is a 404 or whatever?)

    # Store data of new articles
    for entry in feedparser.parse(r_feed.text).entries:
        print(entry.link)
        entry_dir = get_valid_filename(entry.link)  # input e.g. https://orf.at/stories/3117136/
        entry_path = assets_path + '/' + entry_dir
        if not os.path.exists(entry_path):
            r = requests.get(entry.link.split('?')[0], headers=requestheaders)

            online_soup = BeautifulSoup(r.text, 'html.parser')

            content_soup = BeautifulSoup('<article></article>', 'html.parser')

            # Remove all comments
            for element in online_soup(text=lambda text: isinstance(text, Comment)):
                element.extract()

            # domain and path specific rules
            # ... split strings for (very simple) ob+fu+sca+tion

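            # The class/id/itemprop selectors below are tied to the markup the two
            # sites used at the time of writing; if the markup changes, find() returns
            # None and BeautifulSoup will refuse to append it.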
            if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
                if entry.get('date'):
                    article_time = content_soup.new_tag('time', datetime=entry.date)
                    content_soup.article.append(article_time)
                article_headline = online_soup.find('h1', attrs={'class': 'story-lead-headline'})
                content_soup.article.append(article_headline)
                article_body = online_soup.find('div', attrs={'class': 'story-content'})
                content_soup.article.append(article_body)
                article_link = content_soup.new_tag('a', href=entry.link)
                article_link['class'] = 'source'
                article_link.string = 'Quelle (' + entry.link + ')'
                content_soup.article.append(article_link)

            if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'):  # url starts with a number ... too lazy for a regex :)
                print(entry)
                if entry.get('published'):
                    article_time = content_soup.new_tag('time', datetime=entry.published)
                    content_soup.article.append(article_time)
                article_headline = online_soup.find('h1', attrs={'itemprop': 'headline'})
                content_soup.article.append(article_headline)
                # images etc
                article_aside = online_soup.find('div', id="content-aside")
                content_soup.article.append(article_aside)
                article_body = online_soup.find('div', attrs={'itemprop': 'articleBody'})
                content_soup.article.append(article_body)

                # modify original link -> mobile version and comment section
                link_to_comments = re.sub(r'(\/\/)', r'\1mobil.', entry.link.split('?')[0]) + '?_viewMode=forum#'
                article_comments_link = content_soup.new_tag('a', href=link_to_comments)
                article_comments_link['class'] = 'comments'
                article_comments_link.string = 'Kommentare'
                content_soup.article.append(article_comments_link)

                article_link = content_soup.new_tag('a', href=entry.link)
                article_link['class'] = 'source'
                article_link.string = 'Quelle (' + entry.link.split('?')[0] + ')'
                content_soup.article.append(article_link)

            # create directory for storing and serving html and images
            os.makedirs(entry_path)

            # download all article images and replace image source
            for img in content_soup.findAll('img'):
                print(img)
                if img.get('data-src'):
                    old_url = img['data-src']
                    print(old_url)
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['data-src'] = base_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
                if img.get('src'):
                    old_url = img['src']
                    print(old_url)
                    if not old_url.startswith('data:'):
                        new_filename = filename_from_url(old_url)
                        img['src'] = base_url + '/' + entry_dir + '/' + new_filename
                        download_image(old_url, entry_dir, new_filename)
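                # data-srcset holds comma-separated "url width-descriptor" pairs; each
                # URL is downloaded and replaced with its local copy, the descriptor is kept.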
                if img.get('data-srcset'):
                    srcset = img['data-srcset'].split(', ')
                    print(srcset)
                    new_srcset = []
                    for src in srcset:
                        old_url = src.split(' ')[0]
                        src_res = src.split(' ')[1]
                        new_filename = filename_from_url(old_url)
                        download_image(old_url, entry_dir, new_filename)
                        new_url = base_url + '/' + entry_dir + '/' + new_filename
                        src = ' '.join([new_url, src_res])
                        new_srcset.append(src)
                    img['data-srcset'] = ', '.join(new_srcset)

            # TODO(?): HTML5 picture tag

            with open(entry_path + '/index.html', 'w') as f:
                f.write(content_soup.prettify())

            sleep(1.3)

    # Create new feed

    # Maybe building a new feed from scratch using a template would be nicer but ...
    # let's just modify the original one!
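    # The full article HTML is attached to each <item> as a CDATA-wrapped
    # <content:encoded> element, which is where full-content feed readers look.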
    feed_soup = BeautifulSoup(r_feed.text, 'lxml-xml')

    for e in feed_soup.findAll('item'):
        entry_dir = get_valid_filename(e.link.text)
        with open(assets_path + '/' + entry_dir + '/index.html', 'r') as f_content:
            content_tag = feed_soup.new_tag('content:encoded')
            content_tag.string = CData(f_content.read())
            e.append(content_tag)

    with open(public_path + '/' + output_filename, 'w') as f:
        f.write(str(feed_soup))

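
# Each feed entry in the config maps a source feed URL ('source') to an output
# filename ('destination') that process_feed() writes below 'public/'.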
# Let's actually fetch the stuff!
for feed in config['feeds']:
    process_feed(feed['source'], feed['destination'])