Grafoscopio/src/Dataviz/TwitterMessages.class.st

478 lines
16 KiB
Smalltalk

"
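I am TwitterMessages, a helper class to work with collections of Tweet objects.

Responsibility: I hold an ordered collection of tweets (see #messages). I can fill it from an exported JSON file, from a scraped HTML page or from a SQLite database, answer queries about it (split by type, sizes, word and profile frequencies) and build Roassal visualizations (ring overview and monthly activity histogram).

Collaborators:
- Tweet: the elements I store.
- NeoJSONReader, Soup and ZnClient: used by my importing and scraping methods.
- UDBCSQLite3Connection and NBSQLite3Connection: used to read and write the tweets table.
- RTGrapher, RTMultipleData, RTPieBuilder and RTNameCloud: used by my visualization and frequency methods.

Public API and Key Messages
- importFromJSONLocalFile: and importFromJSONRemoteFile:
- scrapTweetsFromHtmlFile:
- loadTweetsFor:from: and populateDataBase:
- splitByType and sizesByType
- ringOverview and activityHistogramFor:in:
- Instances are created with TwitterMessages new.

Internal Representation and Key Implementation Points.
Instance Variables
messages: <OrderedCollection> the tweets I hold, lazily initialized by #messages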
Class {
#name : #TwitterMessages,
#superclass : #Object,
#instVars : [
'messages'
],
#category : #'Dataviz-Twitter'
}
{ #category : #'data visualization' }
TwitterMessages >> activityHistogramFor: aProfileName in: aDataBaseFile [
	"Answer an RTGrapher holding a bar chart of the monthly activity (tweets, retweets and
	replies) stored for aProfileName in the SQLite file aDataBaseFile.
	The data comes from #monthlyActivityDataFor:in:, which answers associations of the form
	month -> { tweets . retweets . replies }.
	When there is no data the user is informed and the receiver is answered instead of a plot."

	| activityData firstTweetMonth rows histogramData plot |
	activityData := self monthlyActivityDataFor: aProfileName in: aDataBaseFile.
	activityData isEmpty ifTrue: [
		self inform: 'There is no data for ', aProfileName, ' in the database'.
		^ self ].
	"Months before the first original tweet only hold retweets of old material; dropping them
	removes those outliers. When the profile has no original tweet at all, keep every month
	instead of failing inside detect:."
	firstTweetMonth := activityData
		detect: [ :each | (each value at: 1) > 0 ]
		ifNone: [ activityData first ].
	rows := OrderedCollection new.
	activityData do: [ :each |
		each key >= firstTweetMonth key ifTrue: [
			rows add: {
				each key asString .
				each value at: 1 .
				each value at: 2 .
				each value at: 3 } ] ].
	"Adapted from the Roassal examples: one group of three bars per month."
	plot := RTGrapher new.
	histogramData := RTMultipleData new.
	histogramData barShape color: Color green.
	histogramData
		points: rows;
		addMetric: #second;
		addMetric: #third;
		addMetric: #fourth.
	"Rotated month labels under each group of bars."
	histogramData barChartUsing: (RTBarLabelFactory new label: #first; fontSize: 7).
	plot add: histogramData.
	plot axisY noDecimal.
	^ plot
]
{ #category : #'as yet unclassified' }
TwitterMessages >> ifEmpty: aBlockClosure [
	"Evaluate aBlockClosure when I hold no messages and answer its value.
	Otherwise answer the receiver, as this method did before."

	^ self messages isEmpty
		ifTrue: [ aBlockClosure value ]
		ifFalse: [ self ]
]
{ #category : #'data scrapping' }
TwitterMessages >> importFromJSONLocalFile: aFilePath [
	"Read the exported tweets found in aFilePath and add one Tweet per JSON entry to my messages.
	aFilePath must answer a stream to #readStream (a file reference, for instance).
	The first line of the file is dropped before parsing: it carries the name of the exported
	array and is not valid JSON."

	| stream rawContents jsonText jsonData |
	stream := aFilePath readStream.
	"Release the file handle even when reading fails."
	rawContents := [ stream contents ] ensure: [ stream close ].
	jsonText := String streamContents: [ :out |
		rawContents lines allButFirstDo: [ :line | out nextPutAll: line ] ].
	jsonData := NeoJSONReader fromString: jsonText.
	jsonData do: [ :each | | currentTweet |
		currentTweet := Tweet new.
		currentTweet
			message: (each at: 'text');
			profile: ((each at: 'user') at: 'screen_name');
			"Only the first 19 characters (date and time) are kept; the rest is discarded."
			date: ((each at: 'created_at') copyFrom: 1 to: 19) asDateAndTime.
		currentTweet detectMessageTypeFrom: each.
		currentTweet detectRetweetedProfileFrom: each.
		currentTweet detectRepliedProfilesFrom: each.
		"Hashtags (entities -> hashtags) are not imported yet."
		self messages add: currentTweet ]
]
{ #category : #'data scrapping' }
TwitterMessages >> importFromJSONRemoteFile: aFileUrl [
	"Download the exported tweets at aFileUrl (a String) into the temporary folder and import
	them with #importFromJSONLocalFile:. A previous download with the same name is replaced."

	| tempFolder tweetsFile |
	tempFolder := FileLocator temp asFileReference.
	tweetsFile := tempFolder / (aFileUrl splitOn: '/') last.
	tweetsFile exists ifTrue: [ tweetsFile delete ].
	"Only set the url here: sending get: before downloadTo: would fetch the resource twice."
	ZnClient new
		url: aFileUrl;
		downloadTo: tempFolder.
	self importFromJSONLocalFile: tweetsFile
]
{ #category : #'as yet unclassified' }
TwitterMessages >> isEmpty [
	"Answer true when I hold no messages."

	^ self messages isEmpty
]
{ #category : #'data queries' }
TwitterMessages >> loadTweetsFor: aProfileName from: aDataBaseFile [
	"Add to my messages one Tweet per row stored for aProfileName in the tweets table of the
	SQLite file aDataBaseFile."

	| db queryResults |
	db := UDBCSQLite3Connection on: aDataBaseFile.
	db open.
	"The profile name is bound as a parameter instead of being pasted into the SQL text,
	and the connection is closed even when the query fails."
	queryResults := [
		(db execute: 'select * from tweets where profile = ?;' with: { aProfileName }) rows ]
		ensure: [ db close ].
	queryResults do: [ :each | | loadedTweet |
		loadedTweet := Tweet new.
		loadedTweet
			url: (each at: 'url');
			type: (each at: 'type');
			message: (each at: 'message');
			profile: (each at: 'profile').
		"populateDataBase: stores no date for tweets without one, so the column may be NULL."
		(each at: 'date') ifNotNil: [ :unixTime |
			loadedTweet date: (DateAndTime fromUnixTime: unixTime) asUTC ].
		self messages add: loadedTweet ]
]
{ #category : #'data queries' }
TwitterMessages >> mentionedProfilesByFrequencyUpTo: aPercentage [
	"Answer the leading aPercentage percent of the entries answered by
	#repliedProfilesByFrequency, rounding the amount of entries down."

	| ranked keptCount |
	ranked := self repliedProfilesByFrequency.
	keptCount := ranked size * aPercentage // 100.
	^ ranked copyFrom: 1 to: keptCount
]
{ #category : #accessing }
TwitterMessages >> messages [
	"Answer the collection of tweets I hold, creating an empty OrderedCollection on first use."

	messages isNil ifTrue: [ messages := OrderedCollection new ].
	^ messages
]
{ #category : #accessing }
TwitterMessages >> messages: anOrderedCollection [
	"Replace the collection of tweets I hold with anOrderedCollection."

	messages := anOrderedCollection
]
{ #category : #'data queries' }
TwitterMessages >> monthlyActivityDataFor: aProfileName in: aDataBaseFile [
	"Answer a sorted Array of associations month -> { tweets . retweets . replies } with the
	monthly activity stored for aProfileName in the tweets table of the SQLite file aDataBaseFile.
	Every month between the first and the last dated row gets an entry, even when nothing
	happened in it. When there is no dated row the user is informed and an empty Array is answered.
	Months are computed in local time by the query."

	| db queryResults datedRows monthOf activityCalendar currentMonth lastMonth |
	db := UDBCSQLite3Connection on: aDataBaseFile.
	db open.
	"The profile name is bound as a parameter instead of being pasted into the SQL text, so no
	temporary table is needed. ORDER BY guarantees that first and last rows are the chronological
	bounds. The connection is closed even when the query fails."
	queryResults := [
		(db
			execute:
				'SELECT strftime(''%Y-%m'', datetime(date, ''unixepoch'', ''localtime'')) AS month, type, count(*) AS amount
				FROM tweets
				WHERE profile = ?
				GROUP BY strftime(''%Y-%m'', datetime(date, ''unixepoch'', ''localtime'')), type
				ORDER BY month;'
			with: { aProfileName }) rows ]
		ensure: [ db close ].
	"Rows without a date have no month and cannot be placed in the calendar, whatever their type."
	datedRows := queryResults select: [ :each | (each at: 'month') notNil ].
	datedRows isEmpty ifTrue: [
		self inform: 'There is no data for that profile in the database'.
		^ Array new ].
	monthOf := [ :row | ((row at: 'month'), '-01') asDate asMonth ].
	activityCalendar := Dictionary new.
	currentMonth := monthOf value: datedRows first.
	lastMonth := monthOf value: datedRows last.
	[ currentMonth > lastMonth ] whileFalse: [
		activityCalendar at: currentMonth put: (Array new: 3 withAll: 0).
		currentMonth := currentMonth next ].
	datedRows do: [ :row | | column counts |
		"Position of the counter inside each month entry; 0 means a type that is not charted."
		column := #('tweet' 'retweet' 'reply') indexOf: (row at: 'type').
		column > 0 ifTrue: [
			counts := activityCalendar at: (monthOf value: row).
			counts at: column put: (counts at: column) + (row at: 'amount') ] ].
	^ activityCalendar associations sorted
]
{ #category : #'data storage / persistance' }
TwitterMessages >> populateDataBase: aDataBaseFile [
	"Store every tweet I hold in the tweets table of the SQLite file aDataBaseFile, creating
	the table when it does not exist yet. The url column is the primary key."

	| db |
	db := UDBCSQLite3Connection on: aDataBaseFile.
	db open.
	"Close the connection even when an insert fails."
	[
		db execute:
			'create table if not exists tweets (
			url text primary key,
			profile text,
			date integer,
			type text,
			message text
			);'.
		self messages do: [ :each |
			db
				execute: 'INSERT INTO tweets values (?, ?, ?, ?, ?);'
				with: {
					each url .
					each profile .
					each date ifNotNil: [ each date asUnixTime asString ] .
					each type .
					each message } ] ]
		ensure: [ db close ]
]
{ #category : #'data queries' }
TwitterMessages >> repliedProfilesByFrequency [
	"Answer the sorted associations computed by RTNameCloud over the replied users of my replies.
	Replies without replied users are skipped.
	NOTE(review): assumes Tweet>>repliedUsers answers a String - confirm."

	| words builder |
	words := String streamContents: [ :out |
		(self splitByType at: 'replies') do: [ :message |
			message repliedUsers ifNotNil: [ :users |
				out space; nextPutAll: users ] ] ].
	builder := RTNameCloud new.
	"A fresh instance of my class plays the dictionary role: it answers #unnecessaryWords."
	builder dictionary: self class new.
	builder addString: words.
	^ builder sortedAssociations
]
{ #category : #'data queries' }
TwitterMessages >> repliesSize [
	"Answer how many of my messages are replies."

	^ (self splitByType at: 'replies') size
]
{ #category : #'data queries' }
TwitterMessages >> retweetedProfilesByFrequency [
	"Answer the sorted associations computed by RTNameCloud over the retweeted user of my retweets.
	Retweets without a retweeted user are skipped.
	NOTE(review): assumes Tweet>>retweetedUser answers a String - confirm."

	| words builder |
	words := String streamContents: [ :out |
		(self splitByType at: 'retweets') do: [ :message |
			message retweetedUser ifNotNil: [ :user |
				out space; nextPutAll: user ] ] ].
	builder := RTNameCloud new.
	"A fresh instance of my class plays the dictionary role: it answers #unnecessaryWords."
	builder dictionary: self class new.
	builder addString: words.
	^ builder sortedAssociations
]
{ #category : #'data queries' }
TwitterMessages >> retweetedProfilesByFrequencyUpTo: aPercentage [
	"Answer the leading aPercentage percent of the entries answered by
	#retweetedProfilesByFrequency, rounding the amount of entries down."

	| ranked keptCount |
	ranked := self retweetedProfilesByFrequency.
	keptCount := ranked size * aPercentage // 100.
	^ ranked copyFrom: 1 to: keptCount
]
{ #category : #'data queries' }
TwitterMessages >> retweetsSize [
	"Answer how many of my messages are retweets."

	^ (self splitByType at: 'retweets') size
]
{ #category : #'data visualization' }
TwitterMessages >> ringOverview [
	"Answer a built RTPieBuilder showing, as a ring, how many of my messages are tweets,
	retweets and replies (in that order)."

	| replies retweets totalTweets ring |
	"Go through the accessor so an untouched instance yields an empty collection, not nil."
	replies := self messages count: [ :each | each type = 'reply' ].
	retweets := self messages count: [ :each | each type = 'retweet' ].
	totalTweets := self messages size - replies - retweets.
	ring := RTPieBuilder new.
	ring interaction popup.
	ring shape current
		innerRadius: 80;
		externalRadius: 100.
	ring objects: { totalTweets . retweets . replies }.
	(ring slice: #value) ifNotNil: [ :group |
		group do: [ :each | each @ (RTDraggable groupToDrag: group) ] ].
	ring normalizer distinctColor.
	ring build.
	^ ring
]
{ #category : #'data visualization' }
TwitterMessages >> ringOverviewFor: aProfileName in: aDataBaseFile [
	"Answer a built RTPieBuilder showing, as a ring, how many tweets, retweets and replies
	(in that order) are stored for aProfileName in the tweets table of the SQLite file
	aDataBaseFile. When the profile has no original tweets the user is informed and the
	receiver is answered instead."

	| db countOfType retweets replies totalTweets ring |
	"Same connection class as the other database methods of this class."
	db := UDBCSQLite3Connection on: aDataBaseFile.
	db open.
	"Values are bound as parameters and rows are counted by the database.
	The connection is closed even when a query fails."
	[
		countOfType := [ :aType |
			(db
				execute: 'select count(*) as amount from tweets where profile = ? and type = ?;'
				with: { aProfileName . aType }) rows first at: 'amount' ].
		retweets := countOfType value: 'retweet'.
		replies := countOfType value: 'reply'.
		totalTweets := ((db
			execute: 'select count(*) as amount from tweets where profile = ?;'
			with: { aProfileName }) rows first at: 'amount') - retweets - replies ]
		ensure: [ db close ].
	totalTweets > 0 ifFalse: [
		self inform: 'There are no tweets in the database for that profile'.
		^ self ].
	ring := RTPieBuilder new.
	ring interaction popup.
	ring shape current
		innerRadius: 80;
		externalRadius: 100.
	ring objects: { totalTweets . retweets . replies }.
	(ring slice: #value) ifNotNil: [ :group |
		group do: [ :each | each @ (RTDraggable groupToDrag: group) ] ].
	ring normalizer distinctColor.
	ring build.
	^ ring
]
{ #category : #'data scrapping' }
TwitterMessages >> scrapTweetsFromHtmlFile: aHtmlFile [
	"Replace my messages with the tweets scraped from a downloaded html file.
	On how to download such file for any given public twitter profile look at:
	http://blog.databigbang.com/scraping-web-sites-which-dynamically-load-data/
	The last element tagged as tweet is not processed."

	| stream htmlTree profile tweetsHtml scrapedTweets |
	stream := aHtmlFile readStream.
	"Release the file handle even when reading fails."
	htmlTree := Soup fromString: ([ stream contents ] ensure: [ stream close ]).
	profile := (((htmlTree findAllTagsByClass: 'ProfileHeaderCard-screennameLink') at: 1)
		attributeAt: 'href') copyReplaceAll: '/' with: ''.
	tweetsHtml := htmlTree findAllTagsByClass: 'tweet'.
	scrapedTweets := OrderedCollection new.
	tweetsHtml allButLastDo: [ :each | | tweet timestamps texts |
		tweet := Tweet new.
		timestamps := each findAllTagsByClass: '_timestamp'.
		timestamps size > 0 ifTrue: [
			"DateAndTime, as in the other methods of this class that set tweet dates."
			tweet date: (DateAndTime fromUnixTime:
				((timestamps at: 1) attributeAt: 'data-time') asInteger) asUTC ].
		(each findAllTagsByClass: 'js-retweet-text') size = 1
			ifTrue: [ tweet type: 'retweet' ]
			ifFalse: [
				(each attributeAt: 'data-is-reply-to') isString
					ifTrue: [ tweet type: 'reply' ]
					ifFalse: [ tweet type: 'tweet' ] ].
		tweet url: (each attributeAt: 'data-permalink-path').
		texts := each findAllTagsByClass: 'TweetTextSize'.
		texts size > 0 ifTrue: [ tweet message: (texts at: 1) text ].
		tweet profile: profile.
		scrapedTweets add: tweet ].
	self messages: scrapedTweets
]
{ #category : #'data queries' }
TwitterMessages >> sizesByType [
	"Answer a Dictionary with the amount of tweets, retweets and replies I hold, under the
	keys tweetsSize, retweetsSize and repliesSize."

	| groups sizes |
	groups := self splitByType.
	sizes := Dictionary new.
	#('tweets' 'retweets' 'replies') do: [ :kind |
		sizes at: kind, 'Size' put: (groups at: kind) size ].
	^ sizes
]
{ #category : #'data queries' }
TwitterMessages >> splitByType [
	"Answer a Dictionary with my messages split in three collections, under the keys
	tweets, retweets and replies. Any message that is neither a retweet nor a reply
	counts as a tweet."

	| retweets replies tweets |
	retweets := self messages select: [ :message | message type = 'retweet' ].
	replies := self messages select: [ :message | message type = 'reply' ].
	"Decide by type in one pass instead of searching each message inside the other two groups."
	tweets := self messages reject: [ :message |
		message type = 'retweet' or: [ message type = 'reply' ] ].
	^ Dictionary new
		at: 'tweets' put: tweets;
		at: 'retweets' put: retweets;
		at: 'replies' put: replies;
		yourself
]
{ #category : #utility }
TwitterMessages >> sumSplittedSizes [
	"Answer the sum of the sizes of the groups answered by #splitByType."

	| total |
	total := 0.
	self splitByType do: [ :group | total := total + group size ].
	^ total
]
{ #category : #'data queries' }
TwitterMessages >> tweetsSize [
	"Answer how many of my messages are neither retweets nor replies."

	^ (self splitByType at: 'tweets') size
]
{ #category : #utility }
TwitterMessages >> unnecessaryWords [
	"Answer the words to leave out of word counts: my own list followed by the one
	answered by RTEnglishDictionary."

	| ownWords |
	ownWords := #('a' 'amp' 'ante' 'así' 'cc' 'con' 'como' 'cuando' 'de' 'del' 'dentro' 'desde' 'el' 'en' 'En' 'es' 'está' 'ha' 'han' 'hay' 'la' 'La' 'las' 'Las' 'lo' 'los' 'más' 'mi' 'ni' 'No' 'nos' 'o' 'On' 'p' 'para' 'Para' 'por' 'q' 'que' 'quienes' 'quieres' 'RT' 'se' 'sea' 'Si' 'sin' 'son' 'su' 'sus' 'tan' 'tu' 'un' 'una' 'vía' 'we' 'y' 'yo' 'z').
	^ ownWords , RTEnglishDictionary new unnecessaryWords
]
{ #category : #'data queries' }
TwitterMessages >> wordsByFrequencyInTweets [
	"Answer the sorted associations computed by RTNameCloud over the text of all my messages.
	Tweets without text are skipped: scrapTweetsFromHtmlFile: leaves the message unset when
	no text tag is found."

	| words builder |
	words := String streamContents: [ :out |
		self messages do: [ :tweet |
			tweet message ifNotNil: [ :text |
				out space; nextPutAll: text ] ] ].
	builder := RTNameCloud new.
	"A fresh instance of my class plays the dictionary role: it answers #unnecessaryWords."
	builder dictionary: self class new.
	builder addString: words.
	^ builder sortedAssociations
]
{ #category : #'data queries' }
TwitterMessages >> wordsByFrequencyInTweetsUpTo: aPercentage [
	"Answer the leading aPercentage percent of the entries answered by
	#wordsByFrequencyInTweets, rounding the amount of entries down."

	| ranked keptCount |
	ranked := self wordsByFrequencyInTweets.
	keptCount := ranked size * aPercentage // 100.
	^ ranked copyFrom: 1 to: keptCount
]