@@ -32,6 +32,7 @@ except:
     print(timestamp(), "ERROR: Config file not found or invalid!")
     sys.exit(1)

+# File location of feeds and assets are in directories next to the script
 public_path = filedir + '/public'
 assets_path = public_path + '/assets'
 feeds_path = public_path + '/feeds'
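
The three paths above give a layout like this next to the script (the location /opt/feedproxy is only an assumption for illustration):

    /opt/feedproxy/public/
    ├── assets/   directories with scraped article HTML and images, one per entry
    └── feeds/    the rewritten feed XML files
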
@@ -70,18 +71,19 @@ def download_image(url, entry_dir, filename):
         for chunk in response.iter_content(1024):
             f.write(chunk)

+# process the feed entries specified in the config
 def process_feed(obj):
     feed_url = obj['source']
     output_filename = obj['destination']

     print(timestamp(), 'Updating:', obj['destination'])

-    # Get the feed
+    # Step 1: Get the feed
     r_feed = requests.get(feed_url, headers=requestheaders)

     # TODO: exception handling (what if the response is a 404 or another error?)

-    # Store data of new articles
+    # Step 2: Scrape and store data of new articles
     for entry in feedparser.parse(r_feed.text).entries:
         entry_dir = get_valid_filename(entry.link)  # input e.g. https://orf.at/stories/3117136/
         entry_path = assets_path + '/' + entry_dir
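
The TODO in this hunk is still open. A minimal sketch of what that handling could look like; it reuses the script's requests import and header dict, while the wrapper function, the timeout value, and the None convention are assumptions:

import requests

def fetch_feed(feed_url, requestheaders):
    # Hypothetical wrapper around the Step 1 request, adding the missing error handling.
    try:
        r_feed = requests.get(feed_url, headers=requestheaders, timeout=30)  # timeout is an assumption
        r_feed.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
        return r_feed
    except requests.RequestException as err:
        print('ERROR: could not fetch feed:', feed_url, err)
        return None  # caller skips this feed and moves on to the next one
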
@@ -108,7 +110,7 @@ def process_feed(obj):
             content_soup.div.append(article_headline)
             article_body = online_soup.find('div', attrs={'class': 'story-content'})
             content_soup.div.append(article_body)

             # Add a link to the original article
             article_source = content_soup.new_tag('a', href=entry.link)
             article_source['class'] = 'source'
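
For readers unfamiliar with the BeautifulSoup calls in this hunk, a self-contained demo of the new_tag/append pattern it relies on; the bare wrapper markup and the link text are assumptions:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div></div>', 'html.parser')  # bare wrapper, like content_soup
link = soup.new_tag('a', href='https://orf.at/stories/3117136/')
link['class'] = 'source'
link.string = 'Source'  # link text is an assumption
p = soup.new_tag('p')
p.append(link)
soup.div.append(p)
print(soup)
# <div><p><a class="source" href="https://orf.at/stories/3117136/">Source</a></p></div>
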
@@ -145,7 +147,6 @@ def process_feed(obj):
             article_source_p.append(article_source)
             content_soup.div.append(article_source_p)

             # create a directory for storing and serving HTML and images
             os.makedirs(entry_path)
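
One caveat with this hunk: os.makedirs(entry_path) raises FileExistsError when the directory already exists, so the call presumably only runs for entries that have not been scraped yet. If that guard ever changes, a tolerant variant would be:

import os
os.makedirs(entry_path, exist_ok=True)  # no error if the directory already exists
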
@@ -178,15 +179,16 @@ def process_feed(obj):

             # TODO(?): HTML5 picture tag

+            # Save HTML
             f = open(entry_path + '/index.html', 'w')
             f.write(str(content_soup))
             f.close()

+            # Wait a bit
             time.sleep(1.3)

-    # Create new feed
+    # Step 3: Create 'new' feed

     # Maybe building a new feed from scratch using a template would be nicer but ...
     # let's just modify the original one!
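
A side note on the save step in this hunk: the open/write/close triple can be collapsed into a with block, which also closes the file if the write raises; an equivalent sketch (the explicit encoding is an addition):

with open(entry_path + '/index.html', 'w', encoding='utf-8') as f:
    f.write(str(content_soup))
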
@@ -201,6 +203,7 @@ def process_feed(obj):
             e.extract()
             print(timestamp(), 'Exclude: ', e.title.text, '->', matches)

+    # Add content
     for e in feed_soup.findAll('item'):
         entry_dir = get_valid_filename(e.link.text)
         f_content = open(assets_path + '/' + entry_dir + '/index.html', 'r')
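
For completeness, a self-contained sketch of how the exclusion step at the top of this hunk plausibly works; the 'xml' parser (which needs lxml installed), the exclude_patterns argument, and the regex matching are assumptions:

import re
from bs4 import BeautifulSoup

def filter_items(feed_xml, exclude_patterns):
    # Drop feed items whose title matches any exclude pattern.
    feed_soup = BeautifulSoup(feed_xml, 'xml')
    for e in feed_soup.findAll('item'):
        matches = [p for p in exclude_patterns if re.search(p, e.title.text)]
        if matches:
            e.extract()  # remove the <item> from the tree
            print('Exclude: ', e.title.text, '->', matches)
    return str(feed_soup)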