diff --git a/config.example.json b/config.example.json
index 06af43e..30144b9 100644
--- a/config.example.json
+++ b/config.example.json
@@ -3,7 +3,8 @@
   "feeds" : [
     {
       "source" : "https://a.newspaper.tld/news.xml",
-      "destination": "newspaper.xml"
+      "destination": "newspaper.xml",
+      "exclude": ["10 most", "hate", "?!", "click", "horrific", "will shock you"]
     },
     {
       "source" : "https://another.newspaper.tld/rss",
diff --git a/feedcake.py b/feedcake.py
index ebb6087..136ea07 100644
--- a/feedcake.py
+++ b/feedcake.py
@@ -28,7 +28,6 @@
 except:
     print("ERROR: Config file not found or invalid!")
     sys.exit(1)
-print(filedir)
 public_path = filedir + '/public'
 assets_path = public_path + '/assets'
 feeds_path = public_path + '/feeds'
@@ -79,7 +78,11 @@ def download_image(url, entry_dir, filename):
         for chunk in response.iter_content(1024):
             f.write(chunk)
 
-def process_feed(feed_url, output_filename):
+def process_feed(obj):
+    feed_url = obj['source']
+    output_filename = obj['destination']
+
+    print('Updating:', obj['destination'])
 
     # Get the feed
     r_feed = requests.get(feed_url, headers=requestheaders)
@@ -88,10 +91,10 @@ def process_feed(feed_url, output_filename):
 
     # Store data of new articles
     for entry in feedparser.parse(r_feed.text).entries:
-        print(entry.link)
         entry_dir = get_valid_filename(entry.link) # input e.g. https://orf.at/stories/3117136/
         entry_path = assets_path + '/'+ entry_dir
         if not os.path.exists(entry_path):
+            print('New item: ', entry.link)
             r = requests.get(entry.link.split('?')[0], headers=requestheaders)
             online_soup = BeautifulSoup(r.text, 'html.parser')
 
@@ -192,6 +195,14 @@ def process_feed(feed_url, output_filename):
 
     feed_soup = BeautifulSoup(r_feed.text, 'lxml-xml')
 
+    # Exclude items
+    if obj.get('exclude') and isinstance(obj['exclude'], list):
+        for e in feed_soup.findAll('item'):
+            matches = [x for x in obj['exclude'] if x.lower() in e.title.text.lower()]
+            if len(matches) > 0:
+                e.extract()
+                print('Exclude: ', e.title.text, '->', matches)
+
    for e in feed_soup.findAll('item'):
        entry_dir = get_valid_filename(e.link.text)
        f_content = open(assets_path + '/' + entry_dir + '/index.html', 'r')
@@ -213,4 +224,4 @@ def process_feed(feed_url, output_filename):
 
 # Let's actually fetch the stuff!
 for feed in config['feeds']:
-    process_feed(feed['source'], feed['destination'])
+    process_feed(feed)
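
The new optional `exclude` key drops feed items whose title contains any of the configured terms, compared case-insensitively, before the filtered feed is written. Below is a minimal standalone sketch of that matching behaviour; the sample RSS snippet and exclude terms are invented for illustration and only require `bs4` and `lxml`, which the project already uses:

```python
from bs4 import BeautifulSoup

# Invented sample feed and exclude list, for illustration only.
sample_rss = """<rss><channel>
  <item><title>10 most horrific headlines that will shock you</title></item>
  <item><title>Budget negotiations continue</title></item>
</channel></rss>"""
exclude = ["10 most", "will shock you"]

feed_soup = BeautifulSoup(sample_rss, 'lxml-xml')
for item in feed_soup.findAll('item'):
    # An item is removed if any exclude term occurs in its title, ignoring case.
    matches = [term for term in exclude if term.lower() in item.title.text.lower()]
    if matches:
        item.extract()  # detach the <item> from the feed tree
        print('Exclude: ', item.title.text, '->', matches)

print(feed_soup)  # only the non-clickbait item remains
```

Because the check is a plain substring match on the title, terms like `"?!"` or `"click"` will also catch titles such as "One weird trick?!" or "Clickbait exposed"; broader terms filter more aggressively.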