
add string exclusion

master
Andreas Demmelbauer, 5 years ago
parent
commit
309f78e1cb
2 changed files with 17 additions and 5 deletions
  1. config.example.json (+2 -1)
  2. feedcake.py (+15 -4)

config.example.json (+2 -1)

@@ -3,7 +3,8 @@
   "feeds" : [
     {
       "source" : "https://a.newspaper.tld/news.xml",
-      "destination": "newspaper.xml"
+      "destination": "newspaper.xml",
+      "exclude": ["10 most", "hate", "?!", "click", "horrific", "will shock you"]
     },
     {
       "source" : "https://another.newspaper.tld/rss",


feedcake.py (+15 -4)

@@ -28,7 +28,6 @@ except:
     print("ERROR: Config file not found or invalid!")
     sys.exit(1)
 
-print(filedir)
 public_path = filedir + '/public'
 assets_path = public_path + '/assets'
 feeds_path = public_path + '/feeds'
@@ -79,7 +78,11 @@ def download_image(url, entry_dir, filename):
         for chunk in response.iter_content(1024):
             f.write(chunk)
 
-def process_feed(feed_url, output_filename):
+def process_feed(obj):
+    feed_url = obj['source']
+    output_filename = obj['destination']
+
+    print('Updating:', obj['destination'])
 
     # Get the feed
     r_feed = requests.get(feed_url, headers=requestheaders)
@@ -88,10 +91,10 @@ def process_feed(feed_url, output_filename):
 
     # Store data of new articles
     for entry in feedparser.parse(r_feed.text).entries:
-        print(entry.link)
         entry_dir = get_valid_filename(entry.link) # input e.g. https://orf.at/stories/3117136/
         entry_path = assets_path + '/'+ entry_dir
         if not os.path.exists(entry_path):
+            print('New item: ', entry.link)
             r = requests.get(entry.link.split('?')[0], headers=requestheaders)
 
             online_soup = BeautifulSoup(r.text, 'html.parser')
@@ -192,6 +195,14 @@ def process_feed(feed_url, output_filename):
 
     feed_soup = BeautifulSoup(r_feed.text, 'lxml-xml')
 
+    # Exclude items
+    if obj.get('exclude') and isinstance(obj['exclude'], list):
+        for e in feed_soup.findAll('item'):
+            matches = [x for x in obj['exclude'] if x.lower() in e.title.text.lower()]
+            if len(matches) > 0:
+                e.extract()
+                print('Exclude: ', e.title.text, '->', matches)
+
     for e in feed_soup.findAll('item'):
         entry_dir = get_valid_filename(e.link.text)
         f_content = open(assets_path + '/' + entry_dir + '/index.html', 'r')
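The exclusion pass in isolation, as a standalone sketch that can be run directly; it assumes bs4 and lxml are installed (the script already uses the 'lxml-xml' parser), and the sample feed XML is invented:

    from bs4 import BeautifulSoup

    feed_xml = """<rss><channel>
    <item><title>These 10 most horrific fails will shock you</title></item>
    <item><title>Parliament passes budget</title></item>
    </channel></rss>"""

    exclude = ["10 most", "will shock you"]
    feed_soup = BeautifulSoup(feed_xml, 'lxml-xml')

    for e in feed_soup.findAll('item'):
        matches = [x for x in exclude if x.lower() in e.title.text.lower()]
        if len(matches) > 0:
            e.extract()  # detach the matched <item> from the tree
            print('Exclude: ', e.title.text, '->', matches)

    # Only the "Parliament passes budget" item is left in feed_soup.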
@@ -213,4 +224,4 @@ def process_feed(feed_url, output_filename):
 # Let's actually fetch the stuff!
 
 for feed in config['feeds']:
-    process_feed(feed['source'], feed['destination'])
+    process_feed(feed)
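Since the call site now passes the whole feed dict, optional per-feed keys like "exclude" reach process_feed without extra plumbing. A hedged usage sketch with invented config values mirroring config.example.json:

    config = {
        "feeds": [
            {
                "source": "https://a.newspaper.tld/news.xml",
                "destination": "newspaper.xml",
                "exclude": ["10 most", "will shock you"]  # optional
            }
        ]
    }

    for feed in config["feeds"]:
        process_feed(feed)  # reads 'source' and 'destination' itself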
