
log timestamps, editable request-headers, README

master
Andreas Demmelbauer 5 years ago
commit 061e57323c
3 changed files with 29 additions and 32 deletions:

  1. README.md (+1 / -1)
  2. config.example.json (+3 / -0)
  3. feedcake.py (+25 / -31)

README.md (+1 / -1)

@@ -51,7 +51,7 @@ news pages
 * edit `cron.sh`
 * make `cron.sh` executable: `chmod +x cron.sh`
 * add cronjob for `cron.sh`: `crontab -e`
-  * `*/5 * * * * /absolute/path/to/cron.sh > /path/to/logfile 2>&1`
+  * `*/5 * * * * /absolute/path/to/cron.sh >> /path/to/logfile 2>&1`
 * setup your webserver:
   * let your webserver somehow point to the `feeds` directory.
 You should protect the http path with a basic authentication.
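
The cronjob line now appends to the logfile (`>>`) instead of truncating it (`>`) on every run, which is what makes the per-line timestamps added in `feedcake.py` below useful: the logfile becomes a running history across runs. A minimal sketch of the lines that accumulate there, reusing the same `strftime` format as the new `timestamp()` helper (feed name and article URL are made-up placeholders):

```python
import time

def timestamp():
    # same format string as in feedcake.py, e.g. [14/Feb/2020:10:30:00 +0100]
    return '[' + time.strftime("%d/%b/%Y:%H:%M:%S %z", time.localtime()) + ']'

# each cron run appends lines like these to /path/to/logfile
print(timestamp(), 'Updating:', 'a-newspaper.xml')
print(timestamp(), 'New item:', 'https://a.newspaper.tld/news/some-story')
print(timestamp(), 'Done!')
```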


config.example.json (+3 / -0)

@@ -1,5 +1,8 @@
 {
     "assets_url" : "https://yourdomain.tld/some-url",
+    "request_headers": {
+        "user-agent" : "Mozilla/5.0 (compatible; Feedcake/0.1 friendly, non-profit newsfeed bot)"
+    },
     "feeds" : [
         {
             "source" : "https://a.newspaper.tld/news.xml",


feedcake.py (+25 / -31)

@@ -5,27 +5,31 @@ import sys
 import re
 import hashlib
 import json
-from time import sleep
+import time
 import feedparser
 import requests
 from bs4 import BeautifulSoup, Comment
 from bs4.element import CData


+def timestamp():
+    return '[' + time.strftime("%d/%b/%Y:%H:%M:%S %z", time.localtime()) + ']'
+

 # default config location is a 'config.json' next to the script.
 try:
     filedir = os.path.dirname(os.path.abspath(__file__))
     if len(sys.argv) < 2:
         configpath = filedir+'/config.json'
-        print("Using default config location: ", configpath)
+        print(timestamp(), "Using default config location: ", configpath)
         config = json.load(open(configpath))
     else:
         configpath = sys.argv[1]
         config = json.load(open(configpath))

 except:
-    print("Problem reading config file: ", configpath)
-    print("ERROR: Config file not found or invalid!")
+    print(timestamp(), "Problem reading config file: ", configpath)
+    print(timestamp(), "ERROR: Config file not found or invalid!")
     sys.exit(1)


 public_path = filedir + '/public'
@@ -35,19 +39,7 @@ feeds_path = public_path + '/feeds'
 # e.g. https://example.com/some-string
 assets_url = config['assets_url']


-
-# "I'm a robot which promises you clicks and $ ... Give me ALL your content!"
-requestheaders = {
-    'user-'+'age'+'nt' :
-        'Mo' + 'zill' + 'a/5.' + '0 (' + 'comp' + 'ati' + 'ble; '+'Go'
-        + 'og'+'le'+ 'bo' + 't/' + '2.1; +http' + '://www.' + 'go'
-        + 'og'+ 'le'+'.com/'+'bo'+'t.html)'
-}
-
-
+requestheaders = config['request_headers']


 # need filname safe strings for storing images along html files
 def get_valid_filename(s):
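
With the hard-coded user agent gone, `config['request_headers']` is read unconditionally, so an older config file without the new key makes the script stop with a `KeyError` right after loading the config. If a fallback were wanted, one option (not part of this commit, purely a sketch) would be to default to an empty header dict:

```python
# hypothetical fallback, not in this commit:
# send no custom headers if "request_headers" is missing from the config
requestheaders = config.get('request_headers', {})
```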
@@ -82,7 +74,7 @@ def process_feed(obj):
     feed_url = obj['source']
     output_filename = obj['destination']

-    print('Updating:', obj['destination'])
+    print(timestamp(), 'Updating:', obj['destination'])

     # Get the feed
     r_feed = requests.get(feed_url, headers=requestheaders)
@@ -94,7 +86,7 @@ def process_feed(obj):
         entry_dir = get_valid_filename(entry.link) # input e.g. https://orf.at/stories/3117136/
         entry_path = assets_path + '/'+ entry_dir
         if not os.path.exists(entry_path):
-            print('New item:', entry.link)
+            print(timestamp(), 'New item:', entry.link)
             r = requests.get(entry.link.split('?')[0], headers=requestheaders)

             online_soup = BeautifulSoup(r.text, 'html.parser')
@@ -106,7 +98,7 @@ def process_feed(obj):
                 element.extract()

             # domain and path specific rules
-            # ... split strings for (very simple) ob+fu+sca+tion
+            # ... ob+fu+sca+tion for seo

             if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
                 if entry.date:
@@ -116,10 +108,10 @@ def process_feed(obj):
                     content_soup.div.append(article_headline)
                     article_body = online_soup.find('div', attrs={'class': 'story-content'})
                     content_soup.div.append(article_body)
-                    article_link = content_soup.new_tag('a', href=entry.link)
-                    article_link['class'] = 'source'
-                    article_link.string = 'Quelle (' + entry.link + ')'
-                    content_soup.div.append(article_link)
+                    article_source = content_soup.new_tag('a', href=entry.link)
+                    article_source['class'] = 'source'
+                    article_source.string = 'Quelle: ' + entry.link
+                    content_soup.div.append(article_source)

             if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'): # url starts with number ... too lazy for regex :)
                 if entry.published:
@@ -142,10 +134,12 @@ def process_feed(obj):
                     article_comments_p.append(article_comments_link)
                     content_soup.div.append(article_comments_p)

-                    article_link = content_soup.new_tag('a', href=entry.link.split('?')[0])
-                    article_link['class'] = 'source'
-                    article_link.string = 'Quelle: ' + entry.link.split('?')[0]
-                    content_soup.div.append(article_link)
+                    article_source = content_soup.new_tag('a', href=entry.link.split('?')[0])
+                    article_source['class'] = 'source'
+                    article_source.string = 'Quelle: ' + entry.link.split('?')[0]
+                    article_source_p = content_soup.new_tag('p')
+                    article_source_p.append(article_source)
+                    content_soup.div.append(article_source_p)


             # create directory for storing and serving html and images
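
The source link under an article is now wrapped in its own `<p>`, so the stored page ends with a block-level element instead of a bare inline `<a>`. A minimal sketch of the markup the appended block produces, using the same BeautifulSoup calls as above (the URL is a placeholder):

```python
from bs4 import BeautifulSoup

content_soup = BeautifulSoup('<div></div>', 'html.parser')
link = 'https://a.newspaper.tld/story/123'  # placeholder

article_source = content_soup.new_tag('a', href=link)
article_source['class'] = 'source'
article_source.string = 'Quelle: ' + link
article_source_p = content_soup.new_tag('p')
article_source_p.append(article_source)
content_soup.div.append(article_source_p)

print(content_soup)
# prints something like:
# <div><p><a class="source" href="https://a.newspaper.tld/story/123">Quelle: https://a.newspaper.tld/story/123</a></p></div>
```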
@@ -185,7 +179,7 @@ def process_feed(obj):
             f.write(str(content_soup))
             f.close()

-            sleep(1.3)
+            time.sleep(1.3)


     # Create new feed
@@ -201,7 +195,7 @@ def process_feed(obj):
             matches = [x for x in obj['exclude'] if x.lower() in e.title.text.lower()]
             if len(matches) > 0:
                 e.extract()
-                print('Exclude: ', e.title.text, '->', matches)
+                print(timestamp(), 'Exclude: ', e.title.text, '->', matches)

     for e in feed_soup.findAll('item'):
         entry_dir = get_valid_filename(e.link.text)
@@ -215,7 +209,7 @@ def process_feed(obj):
     os.makedirs(feeds_path, exist_ok=True)

     f = open(feeds_path + '/' + output_filename, 'w')
-    print('Done!')
+    print(timestamp(), 'Done!')
     f.write(str(feed_soup))
     f.close()
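
The exclude filter above does a case-insensitive substring match of every configured keyword against the item title and drops the item as soon as anything matches. A minimal sketch of that check in isolation (keywords and title are made up):

```python
# hypothetical per-feed exclude list and item title
exclude = ['liveticker', 'podcast']
title = 'Der Liveticker zum Spiel'

matches = [x for x in exclude if x.lower() in title.lower()]
if len(matches) > 0:
    print('Exclude: ', title, '->', matches)  # Exclude:  Der Liveticker zum Spiel -> ['liveticker']
```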



