Files
discretemath-labs/LabPSA_3/networking.py
2022-12-26 15:26:02 +02:00

178 lines
6.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import matplotlib.pyplot as plt
import nltk
from nltk import TweetTokenizer
# ---------------------------------------------------------------------------
# Shared module state used by the analysis steps further down the script.
# ---------------------------------------------------------------------------

# NLTK tweet tokenizer instance used to split each tweet's text into tokens.
tt = TweetTokenizer()

# A token containing any of these characters is excluded from the word counts
# and recorded as a "special" word instead.
special_chars = "1234567890#@.=?\",”$%^&*(…[]):!><"

# Words that are never reported as nouns, whatever tag they receive.
filter_words = ["sure"]

# Per-tweet lookups, all keyed by tweet id.
words_per_tweet = {}      # id -> list of tokens of that tweet
likes_per_id = {}         # id -> value of the tweet's "likes" field
retweets_per_id = {}      # id -> value of the tweet's "retweets" field

# Token collections filled while the tweets are tokenized.
words_tweet_tokenizer = []   # every counted token, duplicates included
words_set = []               # counted tokens without duplicates (kept as a list)
special_words = []           # tokens rejected because of special_chars
hashtags = []                # tokens starting with '#'

# Frequency tables.
word_count_dict = {}                 # lowercased word -> occurrences
word_count_not_case_sensitive = {}   # word exactly as written -> occurrences
nouns_counted = {}                   # lowercased noun -> occurrences
proper_nouns_counted = {}            # capitalised noun -> occurrences

# Occurrences of the user-chosen word per month ("YYYY-MM"), all starting at 0.
month_and_noun = dict.fromkeys(
    (
        '2020-10', '2020-11', '2020-12',
        '2022-01', '2022-02', '2022-03',
        '2022-11',
    ),
    0,
)
# Load the tweets and sort every token into the shared collections:
#   * words_per_tweet      - the raw token list of each tweet, keyed by tweet id
#   * hashtags             - every multi-character token that starts with '#'
#   * special_words        - multi-character tokens containing a special_chars
#                            character (digits, punctuation, '#', '@', ...)
#   * words_tweet_tokenizer / words_set / the two count tables - all remaining
#                            multi-character tokens
# Single-character tokens are ignored everywhere.
with open("tweets.json", "r", encoding="utf-8") as tweetJson:
    tweetJsonData = json.load(tweetJson)

# words_set stays a list (first-seen order); this companion set only makes the
# duplicate check O(1) instead of scanning the list for every token.
_seen_words = set(words_set)

for tweet in tweetJsonData:
    tempWords = tt.tokenize(tweet["text"])
    words_per_tweet[tweet["id"]] = tempWords
    for word in tempWords:
        if len(word) <= 1:
            continue

        # '#' is part of special_chars, so a hashtag can never pass the filter
        # below; it has to be collected before that filter is applied.
        if word[0] == '#':
            hashtags.append(word)

        if any(c in special_chars for c in word):
            special_words.append(word)
            continue

        words_tweet_tokenizer.append(word)

        # Count the token exactly as written ...
        word_count_not_case_sensitive[word] = (
            word_count_not_case_sensitive.get(word, 0) + 1
        )
        # ... and once more under its lowercased form.
        lowered = word.lower()
        word_count_dict[lowered] = word_count_dict.get(lowered, 0) + 1

        if word not in _seen_words:
            _seen_words.add(word)
            words_set.append(word)
# Print the ten most frequent words (lowercased counts).
print("============", "Top 10 words", "============", sep="\n")

# word_count_dict is rebound to a copy ordered by descending count; the steps
# that follow iterate it in that order.
ranked_words = sorted(
    word_count_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
word_count_dict = dict(ranked_words)

for token, occurrences in ranked_words[:10]:
    print(token, " ", occurrences)
# Ask for a word, count how often it occurs in the tweets of each month and
# show the result as a bar chart.
word = input("Write a word")
with open('tweets.json', 'r', encoding='utf-8') as tweet_json:
    tweet_data = json.load(tweet_json)

for tweet in tweet_data:
    # The month key is the first seven characters of the first
    # whitespace-separated field of "created_at".
    # NOTE(review): assumes "created_at" starts with an ISO date (YYYY-MM-DD),
    # which is what the pre-seeded month_and_noun keys suggest - confirm.
    month = tweet["created_at"].split()[0][:7]

    # A token matches when it equals the input as typed or after lowercasing.
    hits = sum(
        1
        for token in nltk.word_tokenize(tweet["text"])
        if token == word or token.lower() == word
    )
    if hits:
        # .get() keeps a tweet from a month that was not pre-seeded from
        # raising KeyError; such a month is simply added to the table.
        month_and_noun[month] = month_and_noun.get(month, 0) + hits

# "YYYY-MM" strings sort chronologically, so months added above still end up
# in the right place on the x axis.
months = sorted(month_and_noun)
counts = [month_and_noun[month] for month in months]
plt.bar(months, counts, color='maroon', width=0.7)
plt.show()
# Build nouns_counted: every lowercased word whose part-of-speech tag is a noun
# tag, mapped to its occurrence count and ordered by descending count.
noun_tags = ('NN', 'NNS', 'NNPS', 'NNP')

for candidate, occurrences in word_count_dict.items():
    # Each word is tagged on its own, as a one-element token list.
    tag = nltk.pos_tag([candidate])[0][1]
    if tag not in noun_tags:
        continue
    # Skip explicitly filtered words and anything containing an apostrophe.
    if candidate in filter_words or "'" in candidate:
        continue
    nouns_counted[candidate] = occurrences

nouns_counted = {
    noun: nouns_counted[noun]
    for noun in sorted(nouns_counted, key=nouns_counted.get, reverse=True)
}
# Print the ten most frequent nouns (nouns_counted is already sorted).
print("============", "Top 10 nouns", "============", sep="\n")

for noun, occurrences in list(nouns_counted.items())[:10]:
    print(noun, " ", occurrences)
# Build proper_nouns_counted: words (in their original spelling) that receive a
# noun tag and start with an uppercase letter, ordered by descending count.
proper_noun_tags = ('NN', 'NNS', 'NNPS', 'NNP')

for candidate, occurrences in word_count_not_case_sensitive.items():
    # Each word is tagged on its own, as a one-element token list.
    tag = nltk.pos_tag([candidate])[0][1]
    if tag not in proper_noun_tags:
        continue
    if candidate in filter_words:
        continue
    if "'" in candidate:
        continue
    # Only capitalised spellings count as proper nouns.
    if not candidate[0].isupper():
        continue
    proper_nouns_counted[candidate] = occurrences

proper_nouns_counted = {
    name: proper_nouns_counted[name]
    for name in sorted(
        proper_nouns_counted, key=proper_nouns_counted.get, reverse=True
    )
}
# Print the ten most frequent proper nouns (already sorted by count).
print("===================", "Top 10 proper nouns", "===================", sep="\n")

for name, occurrences in list(proper_nouns_counted.items())[:10]:
    print(name, " ", occurrences)
# Record the "likes" and "retweets" fields of every tweet, keyed by tweet id.
with open('tweets.json', 'r', encoding='utf-8') as tweet_json:
    tweet_data = json.load(tweet_json)

for tweet in tweet_data:
    tweet_id = tweet["id"]
    likes_per_id[tweet_id] = tweet["likes"]
    retweets_per_id[tweet_id] = tweet["retweets"]
# Score every noun by popularity:
#     occurrences * (1.4 * retweets) * (1.2 * likes)
# where retweets and likes are summed over all tweets that contain the noun.
popularity_nouns = {}

# Nouns are stored lowercased while words_per_tweet holds tokens as written,
# so the comparison has to be made against lowercased tokens; otherwise tweets
# that only contain the capitalised form would contribute nothing. Building
# the sets once also avoids rescanning every token list for every noun.
lowered_tokens_per_tweet = {
    tweet_id: {token.lower() for token in tokens}
    for tweet_id, tokens in words_per_tweet.items()
}

for noun in nouns_counted:
    normLikes = 0
    normRetweets = 0
    for tweet_id, tokens in lowered_tokens_per_tweet.items():
        if noun in tokens:
            normLikes += likes_per_id[tweet_id]
            normRetweets += retweets_per_id[tweet_id]
    popularity_nouns[noun] = word_count_dict[noun] * (1.4 * normRetweets) * (1.2 * normLikes)

# Highest score first.
popularity_nouns = dict(sorted(popularity_nouns.items(), key=lambda item: item[1], reverse=True))
# Print the ten nouns with the highest popularity score (already sorted).
print("==========================", "Top 10 nouns by popularity", "==========================", sep="\n")

for noun, score in list(popularity_nouns.items())[:10]:
    print(noun, " ", score)
# Prefix completion: ask for the beginning of a word and print up to ten known
# words that start with it, most frequent first.
word_sliced_count = {}
word = input("Write word for suggestion")

# Keys of word_count_dict are lowercased, so the typed prefix is lowercased
# for matching as well.
prefix = word.lower()

# Only words that really extend the prefix are kept. Seeding every vocabulary
# word with 0 would let unrelated zero-score words be printed as "suggestions"
# whenever fewer than ten words match.
for candidate, occurrences in word_count_dict.items():
    if candidate != prefix and candidate.startswith(prefix):
        word_sliced_count[candidate] = occurrences

word_sliced_count = dict(sorted(word_sliced_count.items(), key=lambda item: item[1], reverse=True))

print("===============")
print("Top suggestions")
print("===============")
for suggestion, occurrences in list(word_sliced_count.items())[:10]:
    print(suggestion, " ", occurrences)
# Next-word suggestion: ask for a word and print the five tokens that most
# often directly follow it in the tweets.
word = input("Write word for suggestion")
words_suggestion_counted = {}

for tokens in words_per_tweet.values():
    # Walk over every adjacent (token, next token) pair, including the pair
    # formed by the last two tokens of the tweet.
    for current, following in zip(tokens, tokens[1:]):
        # Single-character followers are not counted.
        if current == word and len(following) > 1:
            words_suggestion_counted[following] = (
                words_suggestion_counted.get(following, 0) + 1
            )

# Most frequent follower first.
words_suggestion_counted = dict(sorted(words_suggestion_counted.items(), key=lambda item: item[1], reverse=True))

print("==========================")
print("Top suggestion occurrences")
print("==========================")
for follower, occurrences in list(words_suggestion_counted.items())[:5]:
    print(follower, " ", occurrences)