From 061e57323c5fdb1a316aefecc05083185a43a1a7 Mon Sep 17 00:00:00 2001
From: Andreas Demmelbauer
Date: Wed, 3 Apr 2019 17:32:29 -0700
Subject: [PATCH] log timestamps, editable request-headers, README

---
 README.md           |  2 +-
 config.example.json |  3 +++
 feedcake.py         | 56 ++++++++++++++++++++-------------------
 3 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/README.md b/README.md
index 18e372c..1322606 100644
--- a/README.md
+++ b/README.md
@@ -51,7 +51,7 @@ news pages
  * edit `cron.sh`
  * make `cron.sh` executable: `chmod +x cron.sh`
  * add cronjob for `cron.sh`: `crontab -e`
-   * `*/5 * * * * /absolute/path/to/cron.sh > /path/to/logfile 2>&1`
+   * `*/5 * * * * /absolute/path/to/cron.sh >> /path/to/logfile 2>&1`
  * setup your webserver:
    * let your webserver somehow point to the `feeds` directory. You should
      protect the http path with a basic authentication.
diff --git a/config.example.json b/config.example.json
index 30144b9..ab748d5 100644
--- a/config.example.json
+++ b/config.example.json
@@ -1,5 +1,8 @@
 {
   "assets_url" : "https://yourdomain.tld/some-url",
+  "request_headers": {
+    "user-agent" : "Mozilla/5.0 (compatible; Feedcake/0.1 friendly, non-profit newsfeed bot)"
+  },
   "feeds" : [
     {
       "source" : "https://a.newspaper.tld/news.xml",
diff --git a/feedcake.py b/feedcake.py
index f24a776..9f50931 100644
--- a/feedcake.py
+++ b/feedcake.py
@@ -5,27 +5,31 @@ import sys
 import re
 import hashlib
 import json
-from time import sleep
+import time
 import feedparser
 import requests
 from bs4 import BeautifulSoup, Comment
 from bs4.element import CData


+def timestamp():
+    return '[' + time.strftime("%d/%b/%Y:%H:%M:%S %z", time.localtime()) + ']'
+
+

 # default config location is a 'config.json' next to the script.
 try:
     filedir = os.path.dirname(os.path.abspath(__file__))
     if len(sys.argv) < 2:
         configpath = filedir+'/config.json'
-        print("Using default config location: ", configpath)
+        print(timestamp(), "Using default config location: ", configpath)
         config = json.load(open(configpath))
     else:
         configpath = sys.argv[1]
         config = json.load(open(configpath))
 except:
-    print("Problem reading config file: ", configpath)
-    print("ERROR: Config file not found or invalid!")
+    print(timestamp(), "Problem reading config file: ", configpath)
+    print(timestamp(), "ERROR: Config file not found or invalid!")
     sys.exit(1)

 public_path = filedir + '/public'
@@ -35,19 +39,7 @@ feeds_path = public_path + '/feeds'

 # e.g. https://example.com/some-string
 assets_url = config['assets_url']
-
-
-
-# "I'm a robot which promises you clicks and $ ... Give me ALL your content!"
-requestheaders = {
-    'user-'+'age'+'nt' :
-        'Mo' + 'zill' + 'a/5.' + '0 (' + 'comp' + 'ati' + 'ble; '+'Go'
-        + 'og'+'le'+ 'bo' + 't/' + '2.1; +http' + '://www.' + 'go'
-        + 'og'+ 'le'+'.com/'+'bo'+'t.html)'
-}
-
-
-
+requestheaders = config['request_headers']

 # need filname safe strings for storing images along html files
 def get_valid_filename(s):
@@ -82,7 +74,7 @@ def process_feed(obj):
     feed_url = obj['source']
     output_filename = obj['destination']

-    print('Updating:', obj['destination'])
+    print(timestamp(), 'Updating:', obj['destination'])

     # Get the feed
     r_feed = requests.get(feed_url, headers=requestheaders)
@@ -94,7 +86,7 @@ def process_feed(obj):
         entry_dir = get_valid_filename(entry.link) # input e.g. https://orf.at/stories/3117136/
         entry_path = assets_path + '/'+ entry_dir
         if not os.path.exists(entry_path):
-            print('New item:', entry.link)
+            print(timestamp(), 'New item:', entry.link)
             r = requests.get(entry.link.split('?')[0], headers=requestheaders)
             online_soup = BeautifulSoup(r.text, 'html.parser')

@@ -106,7 +98,7 @@ def process_feed(obj):
                 element.extract()

             # domain and path specific rules
-            # ... split strings for (very simple) ob+fu+sca+tion
+            # ... ob+fu+sca+tion for seo
             if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):

                 if entry.date:
@@ -116,10 +108,10 @@ def process_feed(obj):
                 content_soup.div.append(article_headline)
                 article_body = online_soup.find('div', attrs={'class': 'story-content'})
                 content_soup.div.append(article_body)
-                article_link = content_soup.new_tag('a', href=entry.link)
-                article_link['class'] = 'source'
-                article_link.string = 'Quelle (' + entry.link + ')'
-                content_soup.div.append(article_link)
+                article_source = content_soup.new_tag('a', href=entry.link)
+                article_source['class'] = 'source'
+                article_source.string = 'Quelle: ' + entry.link
+                content_soup.div.append(article_source)

             if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'): # url starts with number ... too lazy for regex :)
                 if entry.published:
@@ -142,10 +134,12 @@ def process_feed(obj):
                     article_comments_p.append(article_comments_link)
                     content_soup.div.append(article_comments_p)

-                article_link = content_soup.new_tag('a', href=entry.link.split('?')[0])
-                article_link['class'] = 'source'
-                article_link.string = 'Quelle: ' + entry.link.split('?')[0]
-                content_soup.div.append(article_link)
+                article_source = content_soup.new_tag('a', href=entry.link.split('?')[0])
+                article_source['class'] = 'source'
+                article_source.string = 'Quelle: ' + entry.link.split('?')[0]
+                article_source_p = content_soup.new_tag('p')
+                article_source_p.append(article_source)
+                content_soup.div.append(article_source_p)


             # create directory for storing and serving html and images
@@ -185,7 +179,7 @@ def process_feed(obj):
             f.write(str(content_soup))
             f.close()

-            sleep(1.3)
+            time.sleep(1.3)


     # Create new feed
@@ -201,7 +195,7 @@ def process_feed(obj):
             matches = [x for x in obj['exclude'] if x.lower() in e.title.text.lower()]
             if len(matches) > 0:
                 e.extract()
-                print('Exclude: ', e.title.text, '->', matches)
+                print(timestamp(), 'Exclude: ', e.title.text, '->', matches)

     for e in feed_soup.findAll('item'):
         entry_dir = get_valid_filename(e.link.text)
@@ -215,7 +209,7 @@ def process_feed(obj):

     os.makedirs(feeds_path, exist_ok=True)
     f = open(feeds_path + '/' + output_filename, 'w')
-    print('Done!')
+    print(timestamp(), 'Done!')
     f.write(str(feed_soup))
     f.close()
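
How the two feedcake.py changes fit together once the patch is applied: the hard-coded, string-split Googlebot user agent is gone, requests are sent with whatever headers are listed under request_headers in the config, and every log line is prefixed by timestamp(). The following standalone sketch is not part of the patch; it only illustrates the combination, assuming a config.json copied from config.example.json, and the feed URL is the placeholder from that example file.

    # Illustrative sketch only (not part of the patch).
    # Assumes config.json is a copy of config.example.json.
    import json
    import time

    import requests

    def timestamp():
        # same format feedcake.py now uses, e.g. [03/Apr/2019:17:32:29 -0700]
        return '[' + time.strftime("%d/%b/%Y:%H:%M:%S %z", time.localtime()) + ']'

    config = json.load(open('config.json'))
    requestheaders = config['request_headers']  # e.g. the "Feedcake/0.1" user agent from config.example.json

    r = requests.get('https://a.newspaper.tld/news.xml', headers=requestheaders)
    print(timestamp(), 'Fetched feed with status', r.status_code)

Because the headers now come straight from the config, a deployment can identify itself honestly (as the new example user agent does) without touching the code, and the timestamps make the appended cron log from the README change readable.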