@@ -5,27 +5,31 @@ import sys
 import re
 import hashlib
 import json
-from time import sleep
+import time
 import feedparser
 import requests
 from bs4 import BeautifulSoup, Comment
 from bs4.element import CData
 
 
+def timestamp():
+    return '[' + time.strftime("%d/%b/%Y:%H:%M:%S %z", time.localtime()) + ']'
+
+
 # default config location is a 'config.json' next to the script.
 try:
     filedir = os.path.dirname(os.path.abspath(__file__))
     if len(sys.argv) < 2:
         configpath = filedir+'/config.json'
-        print("Using default config location: ", configpath)
+        print(timestamp(), "Using default config location: ", configpath)
         config = json.load(open(configpath))
     else:
         configpath = sys.argv[1]
         config = json.load(open(configpath))
 except:
-    print("Problem reading config file: ", configpath)
-    print("ERROR: Config file not found or invalid!")
+    print(timestamp(), "Problem reading config file: ", configpath)
+    print(timestamp(), "ERROR: Config file not found or invalid!")
     sys.exit(1)
 
 public_path = filedir + '/public'
@@ -35,19 +39,7 @@ feeds_path = public_path + '/feeds'
 # e.g. https://example.com/some-string
 assets_url = config['assets_url']
 
-# "I'm a robot which promises you clicks and $ ... Give me ALL your content!"
-requestheaders = {
-    'user-'+'age'+'nt' :
-        'Mo' + 'zill' + 'a/5.' + '0 (' + 'comp' + 'ati' + 'ble; '+'Go'
-        + 'og'+'le'+ 'bo' + 't/' + '2.1; +http' + '://www.' + 'go'
-        + 'og'+ 'le'+'.com/'+'bo'+'t.html)'
-}
+requestheaders = config['request_headers']
 
 # need filname safe strings for storing images along html files
 def get_valid_filename(s):
@@ -82,7 +74,7 @@ def process_feed(obj):
     feed_url = obj['source']
     output_filename = obj['destination']
 
-    print('Updating:', obj['destination'])
+    print(timestamp(), 'Updating:', obj['destination'])
 
     # Get the feed
     r_feed = requests.get(feed_url, headers=requestheaders)
@@ -94,7 +86,7 @@ def process_feed(obj):
         entry_dir = get_valid_filename(entry.link) # input e.g. https://orf.at/stories/3117136/
         entry_path = assets_path + '/'+ entry_dir
         if not os.path.exists(entry_path):
-            print('New item:', entry.link)
+            print(timestamp(), 'New item:', entry.link)
             r = requests.get(entry.link.split('?')[0], headers=requestheaders)
 
             online_soup = BeautifulSoup(r.text, 'html.parser')
@@ -106,7 +98,7 @@ def process_feed(obj):
                 element.extract()
 
             # domain and path specific rules
-            # ... split strings for (very simple) ob+fu+sca+tion
+            # ... ob+fu+sca+tion for seo
             if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
                 if entry.date:
@@ -116,10 +108,10 @@ def process_feed(obj):
                 content_soup.div.append(article_headline)
                 article_body = online_soup.find('div', attrs={'class': 'story-content'})
                 content_soup.div.append(article_body)
-                article_link = content_soup.new_tag('a', href=entry.link)
-                article_link['class'] = 'source'
-                article_link.string = 'Quelle (' + entry.link + ')'
-                content_soup.div.append(article_link)
+                article_source = content_soup.new_tag('a', href=entry.link)
+                article_source['class'] = 'source'
+                article_source.string = 'Quelle: ' + entry.link
+                content_soup.div.append(article_source)
 
             if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'): # url starts with number ... too lazy for regex :)
                 if entry.published:
@@ -142,10 +134,12 @@ def process_feed(obj):
                 article_comments_p.append(article_comments_link)
                 content_soup.div.append(article_comments_p)
 
-                article_link = content_soup.new_tag('a', href=entry.link.split('?')[0])
-                article_link['class'] = 'source'
-                article_link.string = 'Quelle: ' + entry.link.split('?')[0]
-                content_soup.div.append(article_link)
+                article_source = content_soup.new_tag('a', href=entry.link.split('?')[0])
+                article_source['class'] = 'source'
+                article_source.string = 'Quelle: ' + entry.link.split('?')[0]
+                article_source_p = content_soup.new_tag('p')
+                article_source_p.append(article_source)
+                content_soup.div.append(article_source_p)
 
 
             # create directory for storing and serving html and images
@@ -185,7 +179,7 @@ def process_feed(obj):
             f.write(str(content_soup))
             f.close()
 
-            sleep(1.3)
+            time.sleep(1.3)
 
 
     # Create new feed
@@ -201,7 +195,7 @@ def process_feed(obj):
         matches = [x for x in obj['exclude'] if x.lower() in e.title.text.lower()]
         if len(matches) > 0:
             e.extract()
-            print('Exclude: ', e.title.text, '->', matches)
+            print(timestamp(), 'Exclude: ', e.title.text, '->', matches)
 
     for e in feed_soup.findAll('item'):
         entry_dir = get_valid_filename(e.link.text)
@@ -215,7 +209,7 @@ def process_feed(obj):
     os.makedirs(feeds_path, exist_ok=True)
 
     f = open(feeds_path + '/' + output_filename, 'w')
-    print('Done!')
+    print(timestamp(), 'Done!')
     f.write(str(feed_soup))
     f.close()
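
Note: with this change the hard-coded, obfuscated user agent is removed and the script now reads its request headers from config.json via config['request_headers'], alongside the existing 'assets_url' and the per-feed 'source' / 'destination' / 'exclude' fields used in process_feed(). A minimal sketch of such a config, written as the Python dict json.load() would return; the top-level 'feeds' key and all values are placeholders assumed for illustration, not taken from the project:

# Hypothetical config shape. Only 'assets_url', 'request_headers', 'source',
# 'destination' and 'exclude' appear in the diff above; everything else,
# including the 'feeds' key and every value, is an assumption.
example_config = {
    "assets_url": "https://example.com/some-string",
    "request_headers": {
        "user-agent": "my-feed-mirror/1.0 (+https://example.com/bot-info)"
    },
    "feeds": [                                        # assumed name of the list handed to process_feed()
        {
            "source": "https://example.com/rss.xml",  # feed_url fetched with requestheaders
            "destination": "example.xml",             # written to public/feeds/
            "exclude": ["sponsored"]                  # title keywords whose items get dropped
        }
    ]
}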