Grafoscopio/src/Dataviz/TweetsCollection.class.st

346 lines
12 KiB
Smalltalk

"
I am TweetsCollection, a helper class to work with collections of Tweet objects.

Responsibility
I keep a collection of tweets, fill it from different sources (an exported JSON file, a downloaded
HTML page or a SQLite data base), store it in a SQLite data base and build visualizations that
differentiate tweets, retweets and replies.

Collaborators
- Tweet: I create one instance per imported message and set its message, profile, date, type and url.
- NeoJSONReader and Soup: I use them to parse the JSON and HTML sources.
- NBSQLite3Connection: I use it to query and populate the tweets table.
- RTGrapher, RTMultipleData and RTPieBuilder: I use them to build the histogram and ring views.

Public API and Key Messages
- importTweetsFromJSONFile: / scrapTweetsFromHtmlFile: / loadTweetsFor:from: fill my tweets.
- populateDataBase: stores my tweets in a SQLite file.
- ringOverview / ringOverviewFor:in: / activityHistogramFor:in: answer visualizations.
- monthlyActivityDataFor:in: answers the monthly counts used by the histogram.
- Instances are created with TweetsCollection new.

Example
	TweetsCollection new
		scrapTweetsFromHtmlFile: aHtmlFile;
		ringOverview

Internal Representation and Key Implementation Points.
Instance Variables
	tweets:		<OrderedCollection> created lazily by the #tweets accessor
"
Class {
#name : #TweetsCollection,
#superclass : #Object,
#instVars : [
'tweets'
],
#category : #'Dataviz-Twitter'
}
{ #category : #'data visualization' }
TweetsCollection >> activityHistogramFor: aProfileName in: aDataBaseFile [
	"I answer a Roassal bar chart (an RTGrapher) of the monthly activity of aProfileName, built from
	the data stored in aDataBaseFile. The data comes from #monthlyActivityDataFor:in:, which answers
	sorted associations month -> { tweets . retweets . replies }; each month gets three bars.
	When there is no data for the profile I only inform the user and answer myself.
	A proper schema of the data base still needs to be published.
	It is the one used in all references to aDataBaseFile."

	| activityDataArray monthOfFirstTweet activityDataCollection histogramData plot |
	"The query does not depend on my own tweets, so there is no need for a throwaway instance"
	activityDataArray := self monthlyActivityDataFor: aProfileName in: aDataBaseFile.
	activityDataArray isEmpty ifTrue: [
		self inform: 'There is no data for ', aProfileName, ' in the database'.
		^ self ].
	"Detecting where the first tweet happened and keeping only the months from there on deletes the
	outliers which correspond to retweets of original tweets far away from the period of the sample.
	A profile made only of retweets and replies has no such month: in that case every month is kept
	instead of raising a NotFound error."
	monthOfFirstTweet := activityDataArray
		detect: [ :each | (each value at: 1) > 0 ]
		ifNone: [ activityDataArray first ].
	activityDataCollection := OrderedCollection new.
	activityDataArray do: [ :each |
		each key >= monthOfFirstTweet key ifTrue: [
			activityDataCollection add: {
				each key asString .
				(each value at: 1) .
				(each value at: 2) .
				(each value at: 3) } ] ].
	"This part was adapted from the awesome roassal examples"
	plot := RTGrapher new.
	histogramData := RTMultipleData new.
	histogramData barShape color: Color green.
	histogramData
		points: activityDataCollection;
		addMetric: #second;
		addMetric: #third;
		addMetric: #fourth.
	"Horizontal text"
	"d barChartWithBarCenteredTitle: #first."
	"Rotated text with integer axis"
	histogramData barChartUsing: (RTBarLabelFactory new label: #first; fontSize: 7).
	plot add: histogramData.
	plot axisY noDecimal.
	^ plot
]
{ #category : #'data queries' }
TweetsCollection >> importTweetsFromJSONFile: aJSONFile [
	"I import all the tweets found in aJSONFile and add them, as Tweet instances, to my tweets.
	aJSONFile must understand #readStream. Its first line is dropped before parsing and the rest is
	expected to be a JSON array of tweet dictionaries with the keys text, user/screen_name,
	created_at, id_str and, optionally, in_reply_to_status_id and retweeted_status."

	| stream truncated jsonData |
	stream := aJSONFile readStream.
	"We need to truncate the original file to remove the first line, which is the name of the
	exported array, so NeoJSONReader doesn't complain. The stream is closed even if reading fails."
	truncated := [
		String streamContents: [ :out |
			stream contents lines allButFirstDo: [ :line | out nextPutAll: line ] ] ]
		ensure: [ stream close ].
	jsonData := NeoJSONReader fromString: truncated.
	jsonData do: [ :each | | currentTweet retweeted |
		currentTweet := Tweet new.
		currentTweet
			message: (each at: 'text');
			profile: ((each at: 'user') at: 'screen_name');
			"Only the first 19 characters of created_at are parsed; whatever follows is ignored"
			date: ((each at: 'created_at') copyFrom: 1 to: 19) asDateAndTime.
		"Detecting the kind of message and processing accordingly.
		A key that is present but holds a null value is treated like a missing key, so such
		tweets are not classified as replies or retweets."
		retweeted := each at: 'retweeted_status' ifAbsent: [ nil ].
		(each at: 'in_reply_to_status_id' ifAbsent: [ nil ]) notNil
			ifTrue: [
				currentTweet
					type: 'reply';
					url: '/', currentTweet profile, '/status/', (each at: 'id_str') ]
			ifFalse: [
				retweeted notNil
					ifTrue: [
						"A retweet points to the url of the original status, not to my own copy"
						currentTweet
							type: 'retweet';
							url: '/', ((retweeted at: 'user') at: 'screen_name'), '/status/', (retweeted at: 'id_str') ]
					ifFalse: [
						currentTweet
							type: 'tweet';
							url: '/', currentTweet profile, '/status/', (each at: 'id_str') ] ].
		"Detecting hashtags"
		"(((each at: 'entities') at: 'hashtags') size > 0)
			ifTrue: [
				(each at: 'entities') at: 'hashtags'
			]."
		self tweets add: currentTweet ]
]
{ #category : #'data queries' }
TweetsCollection >> loadTweetsFor: aProfileName from: aDataBaseFile [
	"I select all the tweets of aProfileName stored in the tweets table of the SQLite file
	aDataBaseFile and add them, as Tweet instances, to my tweets."

	| db queryResults |
	"Opening connection"
	db := NBSQLite3Connection on: aDataBaseFile.
	db open.
	"Querying the data base. The profile name is bound as a parameter instead of being pasted into
	the SQL text, so names containing quotes neither break nor alter the query. The connection is
	closed even when the query fails."
	queryResults := [
		(db execute: 'select * from tweets where profile = ?;' with: { aProfileName }) rows ]
		ensure: [ db close ].
	queryResults do: [ :each | | storedTweet |
		storedTweet := Tweet new.
		storedTweet
			url: (each at: 'url');
			"#populateDataBase: stores a NULL date for tweets without one, so nil is kept as nil"
			date: ((each at: 'date') ifNotNil: [ :seconds | (TimeStamp fromUnixTime: seconds) asUTC ]);
			type: (each at: 'type');
			message: (each at: 'message');
			profile: (each at: 'profile').
		self tweets add: storedTweet ]
]
{ #category : #'data queries' }
TweetsCollection >> monthlyActivityDataFor: aProfileName in: aDataBaseFile [
	"I answer the monthly activity of aProfileName stored in the SQLite file aDataBaseFile as a
	sorted collection of associations month -> { tweets . retweets . replies }.
	Every month between the first and the last month with data is present, with zeros when there
	was no activity. When there is no dated data for the profile I inform the user and answer an
	empty collection (for the moment I suppose that the data base schema is correct)."

	| db queryResults datedRows slotByType firstMonth lastMonth currentMonth activityCalendar |
	"Opening connection"
	db := NBSQLite3Connection on: aDataBaseFile.
	db open.
	"Querying the data base. The profile is bound as a parameter, so no temporary table or string
	concatenation is needed, and the connection is closed even when the query fails.
	ORDER BY makes explicit the chronological order the code below relies on."
	queryResults := [
		(db
			execute: 'SELECT strftime(''%Y-%m'', datetime(date, ''unixepoch'', ''localtime'')) AS month, type, count(*) AS amount
	FROM tweets WHERE profile = ?
	GROUP BY strftime(''%Y-%m'', datetime(date, ''unixepoch'', ''localtime'')), type
	ORDER BY month;'
			with: { aProfileName }) rows ]
		ensure: [ db close ].
	activityCalendar := Dictionary new.
	"Rows without a month come from tweets stored without a date; they cannot be placed in the
	calendar, so they are left out for every type (not only for plain tweets)."
	datedRows := queryResults select: [ :row | (row at: 'month') notNil ].
	datedRows isEmpty ifTrue: [
		self inform: 'There is no data for that profile in the database'.
		^ Array new ].
	firstMonth := ((datedRows first at: 'month'), '-01') asDate asMonth.
	lastMonth := ((datedRows last at: 'month'), '-01') asDate asMonth.
	"One entry per month of the whole period, so months without activity show up as zeros"
	currentMonth := firstMonth.
	[ currentMonth <= lastMonth ] whileTrue: [
		activityCalendar at: currentMonth put: { 0 . 0 . 0 }.
		currentMonth := currentMonth next ].
	"Position of every known type inside the monthly array; rows of any other type are ignored"
	slotByType := Dictionary new.
	slotByType
		at: 'tweet' put: 1;
		at: 'retweet' put: 2;
		at: 'reply' put: 3.
	datedRows do: [ :row |
		slotByType at: (row at: 'type') ifPresent: [ :slot | | counts |
			counts := activityCalendar at: ((row at: 'month'), '-01') asDate asMonth.
			counts at: slot put: (counts at: slot) + (row at: 'amount') ] ].
	^ activityCalendar associations sorted
]
{ #category : #'data storage / persistance' }
TweetsCollection >> populateDataBase: aDataBaseFile [
	"I store my tweets in the tweets table of the SQLite file aDataBaseFile, creating the table
	when it does not exist. Dates are stored as unix time and tweets without a date get NULL."

	| db |
	"Opening connection"
	db := NBSQLite3Connection on: aDataBaseFile.
	db open.
	"The connection is closed even when one of the statements fails"
	[
		"Creating the data base tweets schema"
		db execute:
'create table if not exists tweets (
url text primary key,
profile text,
date integer,
type text,
message text
);'.
		"Populating the database.
		NOTE(review): url is the primary key, so storing a tweet whose url is already in the table
		presumably fails on the INSERT - confirm whether repeated imports should be supported."
		self tweets do: [ :each |
			db
				execute: 'INSERT INTO tweets values (?, ?, ?, ?, ?);'
				with: {
					each url .
					each profile .
					(each date ifNotNil: [ :date | date asUnixTime asString ]) .
					each type .
					each message } ] ]
		ensure: [ db close ]
]
{ #category : #'data visualization' }
TweetsCollection >> ringOverview [
	"I answer an overview of my tweets as a ring (an RTPieBuilder) with three slices: tweets,
	retweets and replies, in that order. Everything that is neither a reply nor a retweet is
	counted as a tweet."

	| totalTweets replies retweets ring |
	"The accessor is used so an untouched collection behaves as empty instead of failing on nil.
	The type stored by the importers is 'retweet' (singular)."
	replies := self tweets count: [ :each | each type = 'reply' ].
	retweets := self tweets count: [ :each | each type = 'retweet' ].
	totalTweets := self tweets size - replies - retweets.
	ring := RTPieBuilder new.
	ring interaction popup.
	ring shape current
		innerRadius: 80;
		externalRadius: 100.
	ring objects: { totalTweets . retweets . replies }.
	"Dragging any slice drags the whole ring"
	(ring slice: #value) ifNotNil: [ :group |
		group do: [ :each | each @ (RTDraggable groupToDrag: group) ] ].
	ring normalizer distinctColor.
	ring build.
	^ ring
]
{ #category : #'data visualization' }
TweetsCollection >> ringOverviewFor: aProfileName in: aDataBaseFile [
	"I answer an overview of the tweets of aProfileName stored in the SQLite file aDataBaseFile as a
	ring (an RTPieBuilder) with three slices: tweets, retweets and replies, in that order.
	When the data base has nothing for the profile I only inform the user and answer myself
	(for the moment I suppose that the data base schema is correct)."

	| db countOf allTweets totalTweets replies retweets ring |
	"Opening connection"
	db := NBSQLite3Connection on: aDataBaseFile.
	db open.
	"Querying the data base. Values are bound as parameters instead of being pasted into the SQL
	text, rows are counted by the data base instead of being fetched, and the connection is closed
	even when a query fails."
	countOf := [ :sql :arguments | ((db execute: sql with: arguments) rows first) at: 'amount' ].
	[
		retweets := countOf
			value: 'select count(*) as amount from tweets where profile = ? and type = ?;'
			value: { aProfileName . 'retweet' }.
		replies := countOf
			value: 'select count(*) as amount from tweets where profile = ? and type = ?;'
			value: { aProfileName . 'reply' }.
		allTweets := countOf
			value: 'select count(*) as amount from tweets where profile = ?;'
			value: { aProfileName } ]
		ensure: [ db close ].
	"The guard looks at everything stored for the profile, so a profile that only has retweets or
	replies still gets its ring"
	allTweets > 0 ifFalse: [
		self inform: 'There are no tweets in the database for that profile'.
		^ self ].
	totalTweets := allTweets - retweets - replies.
	ring := RTPieBuilder new.
	ring interaction popup.
	ring shape current
		innerRadius: 80;
		externalRadius: 100.
	ring objects: { totalTweets . retweets . replies }.
	"Dragging any slice drags the whole ring"
	(ring slice: #value) ifNotNil: [ :group |
		group do: [ :each | each @ (RTDraggable groupToDrag: group) ] ].
	ring normalizer distinctColor.
	ring build.
	^ ring
]
{ #category : #'data scrapping' }
TweetsCollection >> scrapTweetsFromHtmlFile: aHtmlFile [
	"I scrap tweets from a downloaded html file and replace my tweets with the result.
	On how to download such file for any given public twitter profile look at:
	http://blog.databigbang.com/scraping-web-sites-which-dynamically-load-data/
	"

	| tweetsDump htmlTree profile tweetsTemp |
	tweetsDump := aHtmlFile readStream.
	"The stream is closed even when reading or parsing fails"
	htmlTree := [ Soup fromString: tweetsDump contents ] ensure: [ tweetsDump close ].
	"The profile name is the href of the screen name link without its slashes"
	profile := (((htmlTree findAllTagsByClass: 'ProfileHeaderCard-screennameLink') at: 1)
		attributeAt: 'href') copyReplaceAll: '/' with: ''.
	tweetsTemp := OrderedCollection new.
	"NOTE(review): the last tag of class tweet is skipped on purpose by the original code; the
	reason is not visible here - confirm."
	(htmlTree findAllTagsByClass: 'tweet') allButLastDo: [ :each | | tweet timestamps texts |
		tweet := Tweet new.
		timestamps := each findAllTagsByClass: '_timestamp'.
		timestamps size > 0 ifTrue: [
			tweet date: (TimeStamp fromUnixTime: ((timestamps at: 1) attributeAt: 'data-time') asInteger) asUTC ].
		"Exactly one retweet text marks a retweet; otherwise a data-is-reply-to attribute marks a reply"
		(each findAllTagsByClass: 'js-retweet-text') size = 1
			ifTrue: [ tweet type: 'retweet' ]
			ifFalse: [
				(each attributeAt: 'data-is-reply-to') isString
					ifTrue: [ tweet type: 'reply' ]
					ifFalse: [ tweet type: 'tweet' ] ].
		tweet url: (each attributeAt: 'data-permalink-path').
		texts := each findAllTagsByClass: 'TweetTextSize'.
		texts size > 0 ifTrue: [ tweet message: (texts at: 1) text ].
		tweet profile: profile.
		tweetsTemp add: tweet ].
	self tweets: tweetsTemp
]
{ #category : #accessing }
TweetsCollection >> tweets [
	"I answer my collection of tweets, creating an empty OrderedCollection the first time I am asked"

	tweets isNil ifTrue: [ tweets := OrderedCollection new ].
	^ tweets
]
{ #category : #accessing }
TweetsCollection >> tweets: anOrderedCollection [
	"I replace my whole collection of tweets with anOrderedCollection"

	tweets := anOrderedCollection
]