
added soup.io, added dry run, changed cli, added config file

master
Andreas Demmelbauer committed 6 years ago
commit 35a5e3b7f8
7 changed files with 260 additions and 83 deletions
1. .gitignore (+1 -1)
2. README.md (+12 -10)
3. config-example.json (+16 -0)
4. cron-example.sh (+5 -0)
5. cron-sample.sh (+0 -11)
6. requirements.txt (+1 -1)
7. tootbot.py (+225 -60)

.gitignore (+1 -1)

@@ -1,3 +1,3 @@
*.secret
cron.sh
config.json

README.md (+12 -10)

@@ -1,21 +1,23 @@
-# TootBot
+# TootBot Spicy Edition
 
-A small python 3.x script to replicate tweets on a mastodon account.
+A small python 3.x script to replicate tweets and posts from soup.io on a mastodon account.
 
 The script only needs a mastodon login/pass to post toots.
 
-It gets the tweets from the RSS available at http://twitrss.me, then does some cleanup on the content:
+For Twitter, it gets the tweets from the RSS available at http://twitrss.me, then does some cleanup on the content:
 - twitter tracking links (t.co) are dereferenced
 - twitter hosted pictures are retrieved and uploaded to mastodon
 
-A sqlite database is used to keep track of tweets that have been tooted.
+For Soup, it uses the official RSS feed (e.g. https://metalab.soup.io/rss), then does some cleanup on the content:
+- checks if the source is twitter and checks for duplicates
+- removes html stuff
+- looks for one picture in the source link or in the soup post and uploads it to mastodon
+- adds the source url (if no source is specified, the soup post is the source url)
 
 
-This script is in use for a few accounts:
-- cq94 -> https://amicale.net/@cquest
-- Etalab -> https://mastodon.etalab.gouv.fr/@etalab
-- datagouvfr -> https://mastodon.etalab.gouv.fr/@datagouvfr
-- osm_fr -> https://fr.osm.social/@osm_fr
-- sotmfr -> https://fr.osm.social/@sotmfr
+A sqlite database is used to keep track of items that have been tooted.
+
+This script is based on TootBot: https://github.com/cquest/tootbot
 
 The script is simply called by a cron job and can run on any server (does not have to be on the mastodon instance server).
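The t.co handling the README mentions is elided from the tootbot.py diff below; for orientation, dereferencing a shortened link is just a matter of following the redirect. A minimal sketch, assuming requests; the function name dereference_tco is hypothetical and not from this commit, though the regex is the same one tootbot.py uses:

    import re
    import requests

    def dereference_tco(text):
        # find the first shortened link in the tweet text (same pattern as tootbot.py)
        m = re.search(r"http[^ \xa0]*", text)
        if m is None:
            return text
        # a HEAD request follows the redirect chain without downloading the body
        r = requests.head(m.group(0), allow_redirects=True)
        return text.replace(m.group(0), r.url)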

config-example.json (+16 -0)

@@ -0,0 +1,16 @@
+{
+    "mastodon": {
+        "instance": "mastodon.social",
+        "user": "loginemail@example.com",
+        "password": "correcthorsebatterystaple"
+    },
+    "sources": {
+        "twitter": "metalabVie",
+        "soup": "metalab.soup.io"
+    },
+    "settings": {
+        "days": 1,
+        "dryrun": true,
+        "databasefilepath": "tootbot.db"
+    }
+}
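These keys map one-to-one onto the lookups tootbot.py performs after loading the file. A minimal sketch of the loading step, with the path hard-coded here for brevity:

    import json

    config = json.load(open('config-example.json'))

    mastinstance = config['mastodon']['instance']   # e.g. mastodon.social
    twitter = config['sources']['twitter']          # twitter account to mirror
    dryrun = config['settings']['dryrun']           # true skips all uploads and posts
    days = config['settings']['days']               # only mirror items newer than this many days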

cron-example.sh (+5 -0)

@@ -0,0 +1,5 @@
+# activate virtualenv if necessary
+# source /home/cquest/.virtualenvs/tootbot/bin/activate
+
+python tootbot.py config.json
+python tootbot.py anotherconfig.json
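The script does no scheduling of its own, so the wrapper above is meant to be invoked from cron. A hypothetical crontab entry (the path is a placeholder, not from this repo) that runs it every 15 minutes:

    # hypothetical crontab line; adjust path and interval to taste
    */15 * * * * /home/user/tootbot/cron-example.sh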

cron-sample.sh (+0 -11)

@@ -1,11 +0,0 @@
-# activate virtualenv if necessary
-# source /home/cquest/.virtualenvs/tootbot/bin/activate
-
-# parameters:
-# 1- twitter account to clone
-# 2- mastodon login
-# 3- mastodon password
-# 4- instance domain (https:// is automatically added)
-
-python tootbot.py geonym_fr geonym@amicale.net **password** test.amicale.net
-python tootbot.py cq94 cquest@amicale.net **password** test.amicale.net

requirements.txt (+1 -1)

@@ -1,4 +1,4 @@
feedparser
Mastodon.py
requests
html2text

tootbot.py (+225 -60)

@@ -6,80 +6,125 @@ import json
 import requests
 import re
 import sqlite3
+import html2text
 from datetime import datetime, date, time, timedelta
 
-if len(sys.argv) < 4:
-    print("Usage: python3 tootbot.py twitter_account mastodon_login mastodon_passwd mastodon_instance")
+
+# default config location is a 'config.json' next to the script.
+try:
+    filedir = os.path.dirname(os.path.abspath(__file__))
+    if len(sys.argv) < 2:
+        print("Using default config location: %s/config.json" % filedir)
+        config = json.load(open(filedir+'/config.json'))
+    else:
+        config = json.load(open(sys.argv[1]))
+except:
+    print("ERROR: Config file not found!")
+    sys.exit(1)
+
+mastinstance = config['mastodon']['instance']
+mastuser = config['mastodon']['user']
+mastpasswd = config['mastodon']['password']
+
+twitter = config['sources']['twitter']
+soup = config['sources']['soup']
+
+dryrun = config['settings']['dryrun']
+days = config['settings']['days']
 
 # sqlite db to store processed tweets (and corresponding toots ids)
-sql = sqlite3.connect('tootbot.db')
+sql = sqlite3.connect(config['settings']['databasefilepath'])
 db = sql.cursor()
-db.execute('''CREATE TABLE IF NOT EXISTS tweets (tweet text, toot text, twitter text, mastodon text, instance text)''')
+db.execute('''CREATE TABLE IF NOT EXISTS posts (srcpost text, srcuser text, mastpost text, mastuser text, mastinstance text)''')
 
+mastodon_api = None
+
+def register_app(mastuser, mastpasswd, mastinstance, mastodon_api):
+    if mastodon_api is None:
+        if not os.path.isfile(mastinstance+'.secret'):
+            if Mastodon.create_app(
+                'metasyndicator',
+                api_base_url='https://'+mastinstance,
+                to_file=mastinstance+'.secret'
+            ):
+                print('app created on instance '+mastinstance)
+            else:
+                print('failed to create app on instance '+mastinstance)
+                sys.exit(1)
+
+    try:
+        mastodon_api = Mastodon(
+            client_id=mastinstance+'.secret',
+            api_base_url='https://'+mastinstance
+        )
+        mastodon_api.log_in(
+            username=mastuser,
+            password=mastpasswd,
+            scopes=['read', 'write'],
+            to_file=mastuser+".secret"
+        )
+        return mastodon_api
+    except:
+        print("ERROR: First Login Failed!")
+        sys.exit(1)
+
-if len(sys.argv) > 4:
-    instance = sys.argv[4]
-else:
-    instance = 'amicale.net'
-
-if len(sys.argv) > 5:
-    days = int(sys.argv[5])
-else:
-    days = 1
-
-twitter = sys.argv[1]
-mastodon = sys.argv[2]
-passwd = sys.argv[3]
-
-mastodon_api = None
-
-d = feedparser.parse('http://twitrss.me/twitter_user_to_rss/?user='+twitter)
+# twitter section
 
-for t in reversed(d.entries):
+print('====== TWITTER ======')
+
+t = feedparser.parse('http://twitrss.me/twitter_user_to_rss/?user='+twitter)
+
+for p in reversed(t.entries):
     # check if this tweet has been processed
-    db.execute('SELECT * FROM tweets WHERE tweet = ? AND twitter = ? and mastodon = ? and instance = ?', (t.id, twitter, mastodon, instance))
+    db.execute(
+        'SELECT * FROM posts WHERE srcpost = ? AND srcuser = ? AND mastuser = ? AND mastinstance = ?',
+        (p.id, twitter, mastuser, mastinstance)
+    )
     last = db.fetchone()
 
-    # process only unprocessed tweets less than 1 day old
-    if last is None and (datetime.now()-datetime(t.published_parsed.tm_year, t.published_parsed.tm_mon, t.published_parsed.tm_mday, t.published_parsed.tm_hour, t.published_parsed.tm_min, t.published_parsed.tm_sec) < timedelta(days=days)):
-        if mastodon_api is None:
-            # Create application if it does not exist
-            if not os.path.isfile(instance+'.secret'):
-                if Mastodon.create_app(
-                    'tootbot',
-                    api_base_url='https://'+instance,
-                    to_file=instance+'.secret'
-                ):
-                    print('tootbot app created on instance '+instance)
-                else:
-                    print('failed to create app on instance '+instance)
-                    sys.exit(1)
+    shouldpost = True
+    posttime = datetime(p.published_parsed.tm_year, p.published_parsed.tm_mon, p.published_parsed.tm_mday, p.published_parsed.tm_hour, p.published_parsed.tm_min, p.published_parsed.tm_sec)
+    if last is not None:
+        shouldpost = False
+        print("skip: already posted")
+    # process only unprocessed tweets less than n days old
+    if datetime.now() - posttime > timedelta(days=days):
+        shouldpost = False
+        print("skip: Post too old")
+    # kill tweets with fb links with fire!
+    if "https://www.facebook.com" in p.title or "https://m.facebook.com" in p.title:
+        shouldpost = False
+        print("skip: a Tweet that links to facebook? ... That's too much.")
 
-            try:
-                mastodon_api = Mastodon(
-                    client_id=instance+'.secret',
-                    api_base_url='https://'+instance
-                )
-                mastodon_api.log_in(
-                    username=mastodon,
-                    password=passwd,
-                    scopes=['read', 'write'],
-                    to_file=mastodon+".secret"
-                )
-            except:
-                print("ERROR: First Login Failed!")
-                sys.exit(1)
-
-        #h = BeautifulSoup(t.summary_detail.value, "html.parser")
-        c = t.title
-        if t.author != '(%s)' % twitter:
-            c = ("RT %s\n" % t.author[1:-1]) + c
+    if shouldpost:
+        print(posttime)
+        # Create application if it does not exist
+        mastodon_api = register_app(mastuser, mastpasswd, mastinstance, mastodon_api)
+
+        c = p.title
+        if p.author != '(%s)' % twitter:
+            c = ("RT %s\n" % p.author[1:-1]) + c
+
         toot_media = []
         # get the pictures...
-        for p in re.finditer(r"https://pbs.twimg.com/[^ \xa0\"]*", t.summary):
-            media = requests.get(p.group(0))
-            media_posted = mastodon_api.media_post(media.content, mime_type=media.headers.get('content-type'))
-            toot_media.append(media_posted['id'])
+        for pic in re.finditer(r"https://pbs.twimg.com/[^ \xa0\"]*", p.summary):
+            if (not dryrun):
+                media = requests.get(pic.group(0))
+                media_posted = mastodon_api.media_post(media.content, mime_type=media.headers.get('content-type'))
+                toot_media.append(media_posted['id'])
+                media = None
+            else:
+                print('Dryrun: not fetching ', pic.group(0), ' and not uploading it to mastodon')
 
         # replace t.co link by original URL
         m = re.search(r"http[^ \xa0]*", c)
@@ -97,10 +142,130 @@ for t in reversed(d.entries):
 
         # remove ellipsis
         c = c.replace('\xa0…', ' ')
         print(c)
 
-        if toot_media is not None:
-            toot = mastodon_api.status_post(c, in_reply_to_id=None, media_ids=toot_media, sensitive=False, visibility='public', spoiler_text=None)
-            if "id" in toot:
-                db.execute("INSERT INTO tweets VALUES ( ? , ? , ? , ? , ? )",
-                           (t.id, toot["id"], twitter, mastodon, instance))
+        if (not dryrun):
+            toot = mastodon_api.status_post(c, in_reply_to_id=None, media_ids=toot_media, sensitive=False, visibility='unlisted', spoiler_text=None)
+            print('--> toot posted!')
+            try:
+                db.execute("INSERT INTO posts VALUES ( ? , ? , ? , ? , ? )", (p.id, twitter, toot.id, mastuser, mastinstance))
+                sql.commit()
+            except:
+                print('database execution failed.')
+                print('p.id: ', p.id)
+                print('toot.id: ', toot.id)
+        else:
+            print('Dryrun: not posting toot and not adding it to database')
+    print('------------------------')
+
+
+# soup.io section
+
+print('====== SOUP ======')
+
+h = html2text.HTML2Text()
+h.ignore_links = True
+h.ignore_images = True
+h.body_width = 0
+
+s = feedparser.parse('http://'+soup+'/rss')
+
+for p in reversed(s.entries):
+    # check if this post has been processed
+    db.execute(
+        'SELECT * FROM posts WHERE srcpost = ? AND srcuser = ? AND mastuser = ? AND mastinstance = ?',
+        (p.id, soup, mastuser, mastinstance)
+    )
+    last = db.fetchone()
+
+    # process only unprocessed posts less than n days old
+    posttime = datetime(p.published_parsed.tm_year, p.published_parsed.tm_mon, p.published_parsed.tm_mday, p.published_parsed.tm_hour, p.published_parsed.tm_min, p.published_parsed.tm_sec)
+    if last is None and (datetime.now() - posttime < timedelta(days=days)):
+        # Create application if it does not exist
+        mastodon_api = register_app(mastuser, mastpasswd, mastinstance, mastodon_api)
+
+        print(p.link)
+        j = json.loads(p.soup_attributes)
+
+        # get status id and user if twitter is the source
+        twitterstatus = None
+        twitteruser = None
+        if isinstance(j['source'], str):
+            if (j['source'].startswith('https://twitter.com/') or j['source'].startswith('https://mobile.twitter.com/')):
+                twitterurl = j['source'].split('/')
+                twitteruser = twitterurl[3]
+                if twitterurl[4] == 'status':
+                    twitterstatus = twitterurl[5]
+
+        # get all tweeted statuses
+        db.execute('SELECT srcpost FROM posts WHERE srcuser = ?', (twitter,))
+        postedtweets = []
+        for postedtweet in db.fetchall():
+            postedtweets.append(postedtweet[0].split('/')[-1])
+
+        # check if already tweeted
+        if twitterstatus in postedtweets:
+            print('Already tweeted: ', j['source'])
+        else:
+            # collect information about images
+            pics = []
+            accepted_filetypes = ('.jpg', '.jpeg', '.png', '.webm', '.JPG', '.JPEG', '.PNG', '.WEBM')  # let's not do mp4 for now.
+            if (isinstance(j['source'], str) and j['source'].endswith(accepted_filetypes)):
+                pics.append(j['source'])
+            elif ('url' in j and isinstance(j['url'], str) and j['url'].endswith(accepted_filetypes)):
+                pics.append(j['url'])
+
+            # get the images and post them to mastodon ...
+            toot_media = []
+            for pic in pics:
+                if (not dryrun):
+                    media = requests.get(pic)
+                    print(pic, ' has mimetype ', media.headers.get('content-type'))
+                    media_posted = mastodon_api.media_post(media.content, mime_type=media.headers.get('content-type'))
+                    toot_media.append(media_posted['id'])
+                else:
+                    print('Dryrun: not fetching ', pic, ' and not uploading it to mastodon')
+
+            # remove all html stuff - the python module in use only supports markdown, not pure plaintext
+            textsrc = h.handle(p.summary_detail.value.replace("<small>", "<br><small>"))
+            # free the text from lines without visible characters
+            cleantextsrc = ''
+            for line in textsrc.split('\n'):
+                line = line.strip()
+                cleantextsrc += line + '\n'
+
+            # strip newlines, reduce newlines, remove markdown bold (i know, ugly), do some clean up
+            text = cleantextsrc.strip('\n').replace('\n\n\n', '\n\n').replace('**', '').replace('\\--', '')
+
+            # link directly to the source or use soup as the source.
+            if (isinstance(j['source'], str) and j['source'] not in text):
+                source = '\n\nSource: ' + j['source']
+            else:
+                source = '\n\nSource: ' + p.link
+
+            # shorten the text if too long
+            maximumlength = 500 - 1 - len(source) - 50  # 50 ... just in case (if they also count the attachment url and so on)
+            text = (text[:maximumlength] + '…') if len(text) > maximumlength else text
+
+            # add source
+            text += source
+
+            print(text)
+
+            if (not dryrun):
+                # post toot
+                toot = mastodon_api.status_post(text, in_reply_to_id=None, media_ids=toot_media, sensitive=False, visibility='public', spoiler_text=None)
+
+                # add entry to database
+                if "id" in toot:
+                    db.execute("INSERT INTO posts VALUES ( ? , ? , ? , ? , ? )", (p.id, soup, toot.id, mastuser, mastinstance))
+                    sql.commit()
+                    print('--> ', p.id, ' posted!')
+            else:
+                print('Dryrun: not posting toot and not adding it to database')
+
+    print('------------------------')
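Both sections rebuild a datetime from feedparser's published_parsed field by field. For reference, published_parsed is a time.struct_time, so the same freshness check can be written more compactly; a sketch, not code from this commit:

    from datetime import datetime, timedelta

    def is_fresh(entry, days):
        # a struct_time slice unpacks straight into the datetime constructor
        posttime = datetime(*entry.published_parsed[:6])
        return datetime.now() - posttime < timedelta(days=days)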
