# redplanet / feedcake


			
							# This script is intended for personal and scientific use only

import os
import sys
import re
import hashlib
import json
from time import sleep
import feedparser
import requests
from bs4 import BeautifulSoup, Comment
from bs4.element import CData


# The default config location is a 'config.json' next to the script; a
# different path may be given as the first command line argument.
filedir = os.path.dirname(os.path.abspath(__file__))
if len(sys.argv) < 2:
    configpath = filedir + '/config.json'
    print("Using default config location: ", configpath)
else:
    configpath = sys.argv[1]

try:
    # The with block closes the file handle instead of leaking it.
    with open(configpath) as configfile:
        config = json.load(configfile)
except (OSError, ValueError):
    # OSError: file missing or unreadable. ValueError: invalid JSON
    # (json.JSONDecodeError is a subclass) or undecodable bytes.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
    print("Problem reading config file: ", configpath)
    print("ERROR: Config file not found or invalid!")
    sys.exit(1)

# Output locations, both relative to the directory of this script.
print(filedir)
public_path = '/'.join((filedir, 'public'))
assets_path = '/'.join((public_path, 'assets'))

# URL prefix used when image sources are rewritten,
# e.g. https://example.com/some-string
base_url = config['base_url']


# "I'm a robot which promises you clicks and $ ... Give me ALL your content!"
# Headers passed to every HTTP request of this script. Header name and value
# are joined from fragments (very simple obfuscation), so neither shows up
# as a single literal in the source.
requestheaders = {
    ''.join(('user-', 'age', 'nt')): ''.join((
        'Mo', 'zill', 'a/5.', '0 (', 'comp', 'ati', 'ble; ', 'Go',
        'og', 'le', 'bo', 't/', '2.1; +http', '://www.', 'go',
        'og', 'le', '.com/', 'bo', 't.html)',
    )),
}


def get_valid_filename(s):
    """Turn a URL or file name into a string that is safe as a file name.

    Needed for storing images and directories along the html files. The
    query string, surrounding whitespace, surrounding slashes and a
    leading ``http://`` or ``https://`` are removed. Spaces and every
    remaining character that is not a word character, ``-`` or ``.`` are
    replaced by ``-``.

    Args:
        s: URL or file name; converted with ``str()`` first.

    Returns:
        The sanitized string.
    """
    s = str(s).split('?')[0].strip().strip('/')
    # str.strip('http://') removes any of the characters 'h', 't', 'p', ':'
    # and '/' from BOTH ends (turning e.g. 'photo.png' into 'oto.png'), so
    # the scheme is removed as an exact prefix instead.
    for scheme in ('https://', 'http://'):
        if s.startswith(scheme):
            s = s[len(scheme):]
            break
    return re.sub(r'(?u)[^-\w.]', '-', s.replace(' ', '-'))

def filename_from_url(url):
    """Get a unique and valid file name from a URL (for images).

    The last path segment of the URL (query string removed) is split on
    '.', the MD5 hex digest of the complete URL is placed right after the
    first part, and the re-joined name is run through get_valid_filename().
    """
    # last path segment, without GET attributes
    basename = url.split('?')[0].split('/')[-1]
    first_part, *remaining_parts = basename.split('.')
    # the digest of the full URL keeps equally named files apart
    url_digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    hashed_name = '.'.join([first_part, url_digest, *remaining_parts])
    # extra validation of the assembled name
    return get_valid_filename(hashed_name)

def download_image(url, entry_dir, filename, timeout=30):
    """Download a file (images and so on) into the directory of an entry.

    The file is written to ``assets_path/entry_dir/filename``. Responses
    with a status other than 200 and failed requests are skipped, so one
    broken image does not abort the whole run.

    Args:
        url: Source URL; protocol relative URLs ('//host/path') are
            requested via https.
        entry_dir: Name of the entry's directory below ``assets_path``;
            it has to exist already.
        filename: Name of the file to write inside that directory.
        timeout: Seconds to wait for the server before giving up.
    """
    # take care of protocol relative URLs ... let's just assume that https works.
    if url.startswith('//'):
        url = 'https:' + url
    target_path = assets_path + '/' + entry_dir + '/' + filename
    try:
        # stream=True avoids loading the whole body into memory; the with
        # block releases the connection even when the body is never read.
        with requests.get(url, headers=requestheaders, stream=True,
                          timeout=timeout) as response:
            if response.status_code != 200:
                return
            with open(target_path, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
    except requests.RequestException as error:
        # e.g. timeouts, connection errors or URLs without a scheme
        print('Could not download ' + url + ': ' + str(error))

def _append_if_found(parent, node):
    """Append node to parent unless the lookup that produced it found nothing."""
    # find() returns None when a page lacks the expected element and
    # None cannot be appended to a tag.
    if node is not None:
        parent.append(node)


def _append_time(content_soup, timestamp):
    """Append a <time> tag carrying timestamp to the article, if one is given."""
    if timestamp:
        content_soup.article.append(
            content_soup.new_tag('time', datetime=timestamp))


def _new_source_link(content_soup, href, label_url):
    """Create the 'Quelle (...)' link that points back to the original article."""
    article_link = content_soup.new_tag('a', href=href)
    article_link['class'] = 'source'
    article_link.string = 'Quelle (' + label_url + ')'
    return article_link


def _localize_image_attr(img, attr, entry_dir):
    """Download the image named by img[attr] and point the attribute at the copy."""
    old_url = img.get(attr)
    if not old_url:
        return
    print(old_url)
    if old_url.startswith('data:'):
        # inline data URIs already carry their content
        return
    new_filename = filename_from_url(old_url)
    img[attr] = base_url + '/' + entry_dir + '/' + new_filename
    download_image(old_url, entry_dir, new_filename)


def _localize_srcset(img, entry_dir):
    """Download every candidate of img['data-srcset'] and rewrite the attribute."""
    srcset = img.get('data-srcset')
    if not srcset:
        return
    print(srcset)
    new_srcset = []
    for candidate in srcset.split(', '):
        parts = candidate.split()
        if not parts:
            continue
        # the descriptor ('2x', '640w', ...) is optional
        old_url, descriptors = parts[0], parts[1:]
        new_filename = filename_from_url(old_url)
        download_image(old_url, entry_dir, new_filename)
        new_url = base_url + '/' + entry_dir + '/' + new_filename
        new_srcset.append(' '.join([new_url] + descriptors))
    img['data-srcset'] = ', '.join(new_srcset)


def process_feed(feed_url, output_filename, timeout=30):
    """Mirror the articles of a feed and write a full-text copy of the feed.

    For every feed entry that has no directory below ``assets_path`` yet,
    the article page is fetched, reduced to the parts picked by the domain
    specific rules (entries matching no rule yield an empty article), its
    images are downloaded and the result is stored as ``index.html`` in
    the entry's directory. Afterwards the original feed is written to
    ``public_path/output_filename`` with the stored HTML added to every
    item as a ``content:encoded`` element.

    Args:
        feed_url: URL of the source feed.
        output_filename: File name of the generated feed below ``public_path``.
        timeout: Seconds to wait for a server response, per request.
    """

    # Get the feed
    r_feed = requests.get(feed_url, headers=requestheaders, timeout=timeout)

    # TODO: exceptions.(what if 404 or whatever?)

    # Store data of new articles
    for entry in feedparser.parse(r_feed.text).entries:
        print(entry.link)
        entry_dir = get_valid_filename(entry.link)  # input e.g. https://orf.at/stories/3117136/
        entry_path = assets_path + '/' + entry_dir
        if os.path.exists(entry_path):
            # already stored during an earlier run
            continue

        clean_link = entry.link.split('?')[0]
        r = requests.get(clean_link, headers=requestheaders, timeout=timeout)

        online_soup = BeautifulSoup(r.text, 'html.parser')

        content_soup = BeautifulSoup('<article></article>', 'html.parser')

        # Remove all Comments
        for element in online_soup(text=lambda text: isinstance(text, Comment)):
            element.extract()

        # domain and path specific rules
        # ... split strings for (very simple) ob+fu+sca+tion

        if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
            # .get() instead of attribute access: a missing date must not raise
            _append_time(content_soup, entry.get('date'))
            _append_if_found(
                content_soup.article,
                online_soup.find('h1', attrs={'class': 'story-lead-headline'}))
            _append_if_found(
                content_soup.article,
                online_soup.find('div', attrs={'class': 'story-content'}))
            content_soup.article.append(
                _new_source_link(content_soup, entry.link, entry.link))

        if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'): # url starts with number ... too lazy for regex :)
            print(entry)
            _append_time(content_soup, entry.get('published'))
            _append_if_found(
                content_soup.article,
                online_soup.find('h1', attrs={'itemprop': 'headline'}))
            # images etc
            _append_if_found(
                content_soup.article,
                online_soup.find('div', id="content-aside"))
            _append_if_found(
                content_soup.article,
                online_soup.find('div', attrs={'itemprop': 'articleBody'}))

            # modify original link -> mobile version and comment section
            # (count=1: only the '//' after the scheme gets the prefix)
            link_to_comments = re.sub(r'(\/\/)', r'\1mobil.', clean_link, count=1) + '?_viewMode=forum#'
            article_comments_link = content_soup.new_tag('a', href=link_to_comments)
            article_comments_link['class'] = 'comments'
            article_comments_link.string = 'Kommentare'
            content_soup.article.append(article_comments_link)

            # the source link goes last, after the comments link
            content_soup.article.append(
                _new_source_link(content_soup, entry.link, clean_link))

        # create directory for storing and serving html and images
        os.makedirs(entry_path)

        # download all article images and replace image source
        for img in content_soup.find_all('img'):
            print(img)
            _localize_image_attr(img, 'data-src', entry_dir)
            _localize_image_attr(img, 'src', entry_dir)
            _localize_srcset(img, entry_dir)

        # TODO(?): HTML5 picture tag

        with open(entry_path + '/index.html', 'w') as f:
            f.write(str(content_soup.prettify()))

        # short pause between two article requests
        sleep(1.3)

    # Create new feed

    # Maybe building a new feed from scratch using a template would be nicer but ...
    # let's just modify the original one!

    feed_soup = BeautifulSoup(r_feed.text, 'lxml-xml')

    for e in feed_soup.find_all('item'):
        entry_dir = get_valid_filename(e.link.text)
        content_tag = feed_soup.new_tag('content:encoded')
        with open(assets_path + '/' + entry_dir + '/index.html', 'r') as f_content:
            content_tag.string = CData(f_content.read())
        e.append(content_tag)

    with open(public_path + '/' + output_filename, 'w') as f:
        f.write(str(feed_soup))


# Let's actually fetch the stuff!
# Every configured feed names the feed to read ('source') and the file name
# handed to process_feed() as output ('destination').

for feed_config in config['feeds']:
    source_url = feed_config['source']
    destination_file = feed_config['destination']
    process_feed(source_url, destination_file)