(text change) |
(major changes) |
||
Line 1: | Line 1: | ||
== Idiomaggio - a language detection bot == | |||
[[File:header.png]] | |||
"The limits of my language are the limits of my world." (Ludwig Wittgenstein) | |||
Idiomaggio is a Twitter bot that detects the language of tweets and responds automatically in the right tongue. In a multilingual world you can be surrounded by many different languages. With the help of Idiomaggio you can figure out which language is being spoken. | |||
The Twitter bot works with the NLTK library, mainly with its built-in ''stop words'' corpus. Linking words, conjunctions, articles and pronouns are the words that make a language speakable; however, they don't carry the meaning of a text. Idiomaggio picks the stop words out of a tweet and detects which language they belong to. In a second step, it replies to the tweet in the detected language. | |||
'''[https://twitter.com/idiomaggio]''' | |||
Idiomaggio understands the European languages Swedish, Danish, Hungarian, Finnish, Portuguese, German, Dutch, French, Spanish, Norwegian, English, Russian, Turkish and Italian. | |||
[[File:English.png]] | |||
[[File:Spanish.png]] | |||
[[File:French.png]] | |||
[[File:Italian.png]] | |||
'''The Code''' | |||
<source lang="python"> | |||
#!/usr/bin/env python2 | |||
# -*- coding: utf-8 -*- # | |||
from twitterbot import TwitterBot | |||
import keys | |||
import nltk | |||
from nltk import wordpunct_tokenize | |||
from nltk.corpus import stopwords | |||
class Idiomaggio(TwitterBot): | |||
def bot_init(self): | |||
""" | |||
Use your own consumer key to make the bot alive. | |||
""" | |||
############################ | |||
# REQUIRED: LOGIN DETAILS! # | |||
############################ | |||
self.config['api_key'] = keys.consumer_key | |||
self.config['api_secret'] = keys.consumer_secret | |||
self.config['access_key'] = keys.access_token | |||
self.config['access_secret'] = keys.access_token_secret | |||
###################################### | |||
# SEMI-OPTIONAL: OTHER CONFIG STUFF! # | |||
###################################### | |||
# how often to tweet, in seconds | |||
self.config['tweet_interval'] = 1 * 10 # default: 30 minutes | |||
# use this to define a (min, max) random range of how often to tweet | |||
# e.g., self.config['tweet_interval_range'] = (5*60, 10*60) # tweets every 5-10 minutes | |||
self.config['tweet_interval_range'] = None | |||
# only reply to tweets that specifically mention the bot | |||
self.config['reply_direct_mention_only'] = True | |||
# only include bot followers (and original tweeter) in @-replies | |||
self.config['reply_followers_only'] = True | |||
# fav any tweets that mention this bot? | |||
self.config['autofav_mentions'] = False | |||
# fav any tweets containing these keywords? | |||
self.config['autofav_keywords'] = [] | |||
# follow back all followers? | |||
self.config['autofollow'] = False | |||
########################################### | |||
# CUSTOM: your bot's own state variables! # | |||
########################################### | |||
def on_scheduled_tweet(self): | |||
pass | |||
def on_mention(self, tweet, prefix): | |||
text = tweet.text | |||
print(text) | |||
print(type(text)) | |||
percentage = {} | |||
tokens = wordpunct_tokenize(text) | |||
words = [] | |||
for word in tokens: | |||
words.append(word.lower()) | |||
for language in stopwords.fileids(): | |||
stopwords_set = set(stopwords.words(language)) | |||
words_set = set(words) | |||
most_common = words_set.intersection(stopwords_set) | |||
percentage[language] = len(most_common) | |||
most_probable = max(percentage, key=percentage.get) | |||
if most_probable == "danish": | |||
response = u'Hej! Taler du dansk?' | |||
prefixed = prefix + ' ' + response | |||
self.post_tweet(prefixed, reply_to=tweet) | |||
if most_probable == "dutch": | |||
response = u'Hi! Groeten uit Holland.' | |||
prefixed = prefix + ' ' + response | |||
self.post_tweet(prefixed, reply_to=tweet) | |||
if most_probable == "english": | |||
response = u'Hey! I speak some English.' | |||
prefixed = prefix + ' ' + response | |||
self.post_tweet(prefixed, reply_to=tweet) | |||
if most_probable == "finnish": | |||
response = u'Hei! Terveisiä Suomi.' | |||
prefixed = prefix + ' ' + response | |||
self.post_tweet(prefixed, reply_to=tweet) | |||
if most_probable == "french": | |||
response = u'Salut! Parlez-vous français?' | |||
prefixed = prefix + ' ' + response | |||
self.post_tweet(prefixed, reply_to=tweet) | |||
if most_probable == "german": | |||
response = u'Hey! Sprichst du deutsch?' | |||
prefixed = prefix + ' ' + response | |||
self.post_tweet(prefixed, reply_to=tweet) | |||
if most_probable == "hungarian": | |||
response = u'Hello! Beszélsz magyarul?' | |||
prefixed = prefix + ' ' + response | |||
self.post_tweet(prefixed, reply_to=tweet) | |||
if most_probable == "italian": | |||
response = u'Ciao! Saluti da Italia.' | |||
prefixed = prefix + ' ' + response | |||
self.post_tweet(prefixed, reply_to=tweet) | |||
if most_probable == "norwegian": | |||
response = u'Hei! Jeg snakker norsk.' | |||
prefixed = prefix + ' ' + response | |||
self.post_tweet(prefixed, reply_to=tweet) | |||
if most_probable == "portuguese": | |||
response = u'Olá! Você fala português?' | |||
prefixed = prefix + ' ' + response | |||
self.post_tweet(prefixed, reply_to=tweet) | |||
if most_probable == "russian": | |||
response = u'Привет! Привет из России.' | |||
prefixed = prefix + ' ' + response | |||
self.post_tweet(prefixed, reply_to=tweet) | |||
if most_probable == "spanish": | |||
response = u'¡Hola! Saludos desde España.' | |||
prefixed = prefix + ' ' + response | |||
self.post_tweet(prefixed, reply_to=tweet) | |||
if most_probable == "swedish": | |||
response = u'Hej! Talar du svenska?' | |||
prefixed = prefix + ' ' + response | |||
self.post_tweet(prefixed, reply_to=tweet) | |||
if most_probable == "turkish": | |||
response = u'Merhaba! Türkçe biliyor musun?' | |||
prefixed = prefix + ' ' + response | |||
self.post_tweet(prefixed, reply_to=tweet) | |||
# print(percentage) | |||
# print(stopwords_set) | |||
# print(words_set) | |||
print("The language of your text is %s" % most_probable) | |||
def on_timeline(self, tweet, prefix): | |||
pass | |||
if __name__ == '__main__': | |||
bot = Idiomaggio() | |||
bot.run() | |||
</source> | |||
[[File:myrobot.png|100px100px|thumb|left]] | [[File:myrobot.png|100px100px|thumb|left]] |
Revision as of 20:41, 3 October 2015
Idiomaggio - a language detection bot
"The limits of my language are the limits of my world." (Ludwig Wittgenstein)
Idiomaggio is a Twitter bot that detects the language of tweets and responds automatically in the right tongue. In a multilingual world you can be surrounded by many different languages. With the help of Idiomaggio you can figure out which language is being spoken.
The Twitter bot works with the NLTK library, mainly with its built-in stop words corpus. Linking words, conjunctions, articles and pronouns are the words that make a language speakable; however, they don't carry the meaning of a text. Idiomaggio picks the stop words out of a tweet and detects which language they belong to. In a second step, it replies to the tweet in the detected language.
Idiomaggio understands the European languages Swedish, Danish, Hungarian, Finnish, Portuguese, German, Dutch, French, Spanish, Norwegian, English, Russian, Turkish and Italian.
The Code
#!/usr/bin/env python2
# -*- coding: utf-8 -*- #
from twitterbot import TwitterBot
import keys
import nltk
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
class Idiomaggio(TwitterBot):
    """Twitter bot that guesses the language of a mention and greets back in it.

    Detection is a plain stop-word overlap count: the tweet is tokenized and
    lowercased, then compared against every stop-word list exposed by the NLTK
    ``stopwords`` corpus. The language sharing the most distinct words with
    the tweet is taken as the tweet's language.
    """

    # Greeting sent back for each NLTK stopwords fileid the bot can answer in.
    # A language that is detected but has no entry here gets no reply.
    GREETINGS = {
        'danish': u'Hej! Taler du dansk?',
        'dutch': u'Hi! Groeten uit Holland.',
        'english': u'Hey! I speak some English.',
        'finnish': u'Hei! Terveisiä Suomi.',
        'french': u'Salut! Parlez-vous français?',
        'german': u'Hey! Sprichst du deutsch?',
        'hungarian': u'Hello! Beszélsz magyarul?',
        'italian': u'Ciao! Saluti da Italia.',
        'norwegian': u'Hei! Jeg snakker norsk.',
        'portuguese': u'Olá! Você fala português?',
        'russian': u'Привет! Привет из России.',
        'spanish': u'¡Hola! Saludos desde España.',
        'swedish': u'Hej! Talar du svenska?',
        'turkish': u'Merhaba! Türkçe biliyor musun?',
    }

    def bot_init(self):
        """
        Fill in the bot configuration.

        The four credentials are read from the local ``keys`` module; supply
        your own consumer key and access token there to bring the bot alive.
        """

        ############################
        # REQUIRED: LOGIN DETAILS! #
        ############################
        self.config['api_key'] = keys.consumer_key
        self.config['api_secret'] = keys.consumer_secret
        self.config['access_key'] = keys.access_token
        self.config['access_secret'] = keys.access_token_secret

        ######################################
        # SEMI-OPTIONAL: OTHER CONFIG STUFF! #
        ######################################

        # how often to tweet, in seconds
        # (10 seconds here; the template's default is 30 minutes)
        self.config['tweet_interval'] = 1 * 10

        # use this to define a (min, max) random range of how often to tweet
        # e.g., self.config['tweet_interval_range'] = (5*60, 10*60) # tweets every 5-10 minutes
        self.config['tweet_interval_range'] = None

        # only reply to tweets that specifically mention the bot
        self.config['reply_direct_mention_only'] = True

        # only include bot followers (and original tweeter) in @-replies
        self.config['reply_followers_only'] = True

        # fav any tweets that mention this bot?
        self.config['autofav_mentions'] = False

        # fav any tweets containing these keywords?
        self.config['autofav_keywords'] = []

        # follow back all followers?
        self.config['autofollow'] = False

        ###########################################
        # CUSTOM: your bot's own state variables! #
        ###########################################

    def on_scheduled_tweet(self):
        """Do nothing: this bot only reacts to mentions."""
        pass

    @staticmethod
    def _detect_language(text):
        """Return the stopwords fileid that best matches ``text``, or None.

        The score of a language is the number of distinct lowercased tokens of
        ``text`` that appear in that language's stop-word list. ``None`` is
        returned when the corpus lists no languages or when no language shares
        a single word with the text, because picking a "best" language among
        all-zero scores would be arbitrary.
        """
        # Tokenize once; the token set is the same for every language.
        words = set(token.lower() for token in wordpunct_tokenize(text))

        scores = {}
        for language in stopwords.fileids():
            scores[language] = len(words.intersection(stopwords.words(language)))

        if not scores:
            return None
        best = max(scores, key=scores.get)
        if scores[best] == 0:
            return None
        return best

    def on_mention(self, tweet, prefix):
        """Reply to a mention with a greeting in the tweet's detected language.

        tweet  -- the mentioning status; only ``tweet.text`` is read here and
                  the object is passed back as ``reply_to``.
        prefix -- text placed in front of the greeting, separated by a space.

        No reply is posted when the language cannot be determined or when the
        detected language has no greeting in ``GREETINGS``.
        """
        text = tweet.text
        print(text)
        print(type(text))

        most_probable = self._detect_language(text)
        if most_probable is None:
            # No stop word of any language occurs in the tweet: stay silent
            # instead of greeting in an arbitrary language.
            print("No stop words matched; not replying")
            return

        response = self.GREETINGS.get(most_probable)
        if response is not None:
            prefixed = prefix + ' ' + response
            self.post_tweet(prefixed, reply_to=tweet)

        print("The language of your text is %s" % most_probable)

    def on_timeline(self, tweet, prefix):
        """Do nothing: timeline tweets are ignored."""
        pass
if __name__ == '__main__':
    # Script entry point: build the bot and start it through its run() method.
    Idiomaggio().run()
Unfortunately, I got stuck with NLTK's cmudict.dict function while trying out this idea: http://h6o6.com/2013/03/using-python-and-the-nltk-to-find-haikus-in-the-public-twitter-stream/
So the result is not that complex… :(
grab the code at pastebin: http://pastebin.com/GvgyfDUU