
log timestamps, editable request-headers, README

master
Andreas Demmelbauer, 5 years ago
parent commit 061e57323c
3 changed files with 29 additions and 32 deletions
  1. README.md  (+1 / -1)
  2. config.example.json  (+3 / -0)
  3. feedcake.py  (+25 / -31)

README.md  (+1 / -1)

@@ -51,7 +51,7 @@ news pages
 * edit `cron.sh`
 * make `cron.sh` executable: `chmod +x cron.sh`
 * add cronjob for `cron.sh`: `crontab -e`
-* `*/5 * * * * /absolute/path/to/cron.sh > /path/to/logfile 2>&1`
+* `*/5 * * * * /absolute/path/to/cron.sh >> /path/to/logfile 2>&1`
 * set up your webserver:
 * let your webserver somehow point to the `feeds` directory.
   You should protect the HTTP path with basic authentication.


config.example.json  (+3 / -0)

@@ -1,5 +1,8 @@
 {
   "assets_url" : "https://yourdomain.tld/some-url",
+  "request_headers": {
+    "user-agent" : "Mozilla/5.0 (compatible; Feedcake/0.1 friendly, non-profit newsfeed bot)"
+  },
   "feeds" : [
     {
       "source" : "https://a.newspaper.tld/news.xml",


feedcake.py  (+25 / -31)

@@ -5,27 +5,31 @@ import sys
 import re
 import hashlib
 import json
-from time import sleep
+import time
 import feedparser
 import requests
 from bs4 import BeautifulSoup, Comment
 from bs4.element import CData


+def timestamp():
+    return '[' + time.strftime("%d/%b/%Y:%H:%M:%S %z", time.localtime()) + ']'
+
+
 # default config location is a 'config.json' next to the script.
 try:
     filedir = os.path.dirname(os.path.abspath(__file__))
     if len(sys.argv) < 2:
         configpath = filedir+'/config.json'
-        print("Using default config location: ", configpath)
+        print(timestamp(), "Using default config location: ", configpath)
         config = json.load(open(configpath))
     else:
         configpath = sys.argv[1]
         config = json.load(open(configpath))

 except:
-    print("Problem reading config file: ", configpath)
-    print("ERROR: Config file not found or invalid!")
+    print(timestamp(), "Problem reading config file: ", configpath)
+    print(timestamp(), "ERROR: Config file not found or invalid!")
     sys.exit(1)

 public_path = filedir + '/public'
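The new `timestamp()` helper prefixes every log line with a local-time stamp in the `%d/%b/%Y:%H:%M:%S %z` layout familiar from webserver access logs, which makes the appended logfile written by `cron.sh` much easier to read. A small sketch of what the calls above emit (the exact date and offset depend on when and where it runs):

import time

def timestamp():
    return '[' + time.strftime("%d/%b/%Y:%H:%M:%S %z", time.localtime()) + ']'

# Example log line as emitted by the print() calls above; output resembles:
# [05/Feb/2020:14:31:07 +0100] Using default config location:  /path/to/config.json
print(timestamp(), "Using default config location: ", "/path/to/config.json")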
@@ -35,19 +39,7 @@ feeds_path = public_path + '/feeds'
 # e.g. https://example.com/some-string
 assets_url = config['assets_url']

-
-
-
-# "I'm a robot which promises you clicks and $ ... Give me ALL your content!"
-requestheaders = {
-    'user-'+'age'+'nt' :
-        'Mo' + 'zill' + 'a/5.' + '0 (' + 'comp' + 'ati' + 'ble; '+'Go'
-        + 'og'+'le'+ 'bo' + 't/' + '2.1; +http' + '://www.' + 'go'
-        + 'og'+ 'le'+'.com/'+'bo'+'t.html)'
-}
-
-
-
+requestheaders = config['request_headers']

 # need filename safe strings for storing images along html files
 def get_valid_filename(s):
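With the hard-coded, string-concatenated user-agent removed, the headers now come straight from `config['request_headers']` and are passed to every `requests.get()` call. A small, hypothetical check (not part of feedcake.py) to confirm which user-agent a given config would send:

import json
import requests

config = json.load(open('config.example.json'))

# Build the request without sending it and inspect the headers that would go out.
prepared = requests.Request('GET', 'https://a.newspaper.tld/news.xml',
                            headers=config['request_headers']).prepare()
print(prepared.headers.get('user-agent'))
# -> Mozilla/5.0 (compatible; Feedcake/0.1 friendly, non-profit newsfeed bot)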
@@ -82,7 +74,7 @@ def process_feed(obj):
     feed_url = obj['source']
     output_filename = obj['destination']

-    print('Updating:', obj['destination'])
+    print(timestamp(), 'Updating:', obj['destination'])

     # Get the feed
     r_feed = requests.get(feed_url, headers=requestheaders)
@@ -94,7 +86,7 @@ def process_feed(obj):
         entry_dir = get_valid_filename(entry.link) # input e.g. https://orf.at/stories/3117136/
         entry_path = assets_path + '/'+ entry_dir
         if not os.path.exists(entry_path):
-            print('New item:', entry.link)
+            print(timestamp(), 'New item:', entry.link)
             r = requests.get(entry.link.split('?')[0], headers=requestheaders)

             online_soup = BeautifulSoup(r.text, 'html.parser')
@@ -106,7 +98,7 @@ def process_feed(obj):
                 element.extract()

             # domain and path specific rules
-            # ... split strings for (very simple) ob+fu+sca+tion
+            # ... ob+fu+sca+tion for seo

             if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
                 if entry.date:
@@ -116,10 +108,10 @@ def process_feed(obj):
                 content_soup.div.append(article_headline)
                 article_body = online_soup.find('div', attrs={'class': 'story-content'})
                 content_soup.div.append(article_body)
-                article_link = content_soup.new_tag('a', href=entry.link)
-                article_link['class'] = 'source'
-                article_link.string = 'Quelle (' + entry.link + ')'
-                content_soup.div.append(article_link)
+                article_source = content_soup.new_tag('a', href=entry.link)
+                article_source['class'] = 'source'
+                article_source.string = 'Quelle: ' + entry.link
+                content_soup.div.append(article_source)

             if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'): # url starts with number ... too lazy for regex :)
                 if entry.published:
@@ -142,10 +134,12 @@ def process_feed(obj):
                 article_comments_p.append(article_comments_link)
                 content_soup.div.append(article_comments_p)

-                article_link = content_soup.new_tag('a', href=entry.link.split('?')[0])
-                article_link['class'] = 'source'
-                article_link.string = 'Quelle: ' + entry.link.split('?')[0]
-                content_soup.div.append(article_link)
+                article_source = content_soup.new_tag('a', href=entry.link.split('?')[0])
+                article_source['class'] = 'source'
+                article_source.string = 'Quelle: ' + entry.link.split('?')[0]
+                article_source_p = content_soup.new_tag('p')
+                article_source_p.append(article_source)
+                content_soup.div.append(article_source_p)


             # create directory for storing and serving html and images
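For derstandard.at entries, the source link is renamed to `article_source` and wrapped in its own paragraph before being appended. A standalone sketch of what that BeautifulSoup manipulation produces, with a made-up example URL:

from bs4 import BeautifulSoup

content_soup = BeautifulSoup('<div></div>', 'html.parser')
link = 'https://example.org/story/2000123456789'

# Same pattern as above: build the <a>, wrap it in a <p>, append to the container <div>.
article_source = content_soup.new_tag('a', href=link)
article_source['class'] = 'source'
article_source.string = 'Quelle: ' + link
article_source_p = content_soup.new_tag('p')
article_source_p.append(article_source)
content_soup.div.append(article_source_p)

print(content_soup)
# roughly: <div><p><a class="source" href="https://example.org/story/2000123456789">Quelle: https://example.org/story/2000123456789</a></p></div>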
@@ -185,7 +179,7 @@ def process_feed(obj):
             f.write(str(content_soup))
             f.close()

-            sleep(1.3)
+            time.sleep(1.3)


     # Create new feed
@@ -201,7 +195,7 @@ def process_feed(obj):
         matches = [x for x in obj['exclude'] if x.lower() in e.title.text.lower()]
         if len(matches) > 0:
             e.extract()
-            print('Exclude: ', e.title.text, '->', matches)
+            print(timestamp(), 'Exclude: ', e.title.text, '->', matches)

     for e in feed_soup.findAll('item'):
         entry_dir = get_valid_filename(e.link.text)
@@ -215,7 +209,7 @@ def process_feed(obj):
     os.makedirs(feeds_path, exist_ok=True)

     f = open(feeds_path + '/' + output_filename, 'w')
-    print('Done!')
+    print(timestamp(), 'Done!')
     f.write(str(feed_soup))
     f.close()


