From db1a8e05029db77bfe786abbbb92601d506a7049 Mon Sep 17 00:00:00 2001
From: ruidajo
Date: Mon, 11 Apr 2022 14:18:52 -0500
Subject: [PATCH] Improving nitter user getting and parsing tweets.

---
Reviewer notes (this section is not part of the commit message):
- pageURLs: stored the tree of page N under the URL of page N+1, so the first
  page was collected twice and the last requested page never was; it also
  downloaded every page two or three times. It now downloads each page once
  and stops when a page has no 'Load more' link.
- pageDocTrees: sent #add: with a String to an OrderedDictionary and could not
  run; it now delegates to pageURLs:.
- The blob ids on the 'index' lines of the edited new files are the ones of
  the original submission.

 .../instance/collectRawTweetsForPages..st     | 15 ++++++++
 .../NitterUser.class/instance/documentTree.st |  2 +-
 .../instance/documentTreeFor..st              |  5 +++
 .../instance/numberOfURLsForLoadingTweets..st | 37 -------------------
 .../instance/pageCursorFor..st                | 12 ++++++
 .../instance/pageDocTrees..st                 |  6 +++
 .../NitterUser.class/instance/pageURLs..st    | 19 ++++++++++
 .../instance/metricsFromNitterHtml..st        |  4 +-
 .../TweetsCollection.class/instance/store.st  |  6 ++-
 9 files changed, 65 insertions(+), 41 deletions(-)
 create mode 100644 Socialmetrica.package/NitterUser.class/instance/collectRawTweetsForPages..st
 create mode 100644 Socialmetrica.package/NitterUser.class/instance/documentTreeFor..st
 delete mode 100644 Socialmetrica.package/NitterUser.class/instance/numberOfURLsForLoadingTweets..st
 create mode 100644 Socialmetrica.package/NitterUser.class/instance/pageCursorFor..st
 create mode 100644 Socialmetrica.package/NitterUser.class/instance/pageDocTrees..st
 create mode 100644 Socialmetrica.package/NitterUser.class/instance/pageURLs..st

diff --git a/Socialmetrica.package/NitterUser.class/instance/collectRawTweetsForPages..st b/Socialmetrica.package/NitterUser.class/instance/collectRawTweetsForPages..st
new file mode 100644
index 0000000..59b5494
--- /dev/null
+++ b/Socialmetrica.package/NitterUser.class/instance/collectRawTweetsForPages..st
@@ -0,0 +1,15 @@
+accessing
+collectRawTweetsForPages: anInteger
+	"Answer a TweetsCollection built from every timeline item found in the pages
+	answered by #pageURLs:. The collection's query is the list of page URLs read."
+
+	| pagesDict rawResponse response |
+	pagesDict := self pageURLs: anInteger.
+	rawResponse := OrderedCollection new.
+	pagesDict values do: [ :each |
+		rawResponse addAll: ((each xpath: '//div[@class="timeline-item "]') asOrderedCollection
+			collect: [ :xmlElement | xmlElement postCopy ]) ].
+	response := TweetsCollection new.
+	response query: pagesDict keys.
+	response tweets: (rawResponse collect: [ :tweet | Tweet new fromNitterHtmlItem: tweet ]).
+	^ response
\ No newline at end of file
diff --git a/Socialmetrica.package/NitterUser.class/instance/documentTree.st b/Socialmetrica.package/NitterUser.class/instance/documentTree.st
index 86360ec..c4a2579 100644
--- a/Socialmetrica.package/NitterUser.class/instance/documentTree.st
+++ b/Socialmetrica.package/NitterUser.class/instance/documentTree.st
@@ -1,3 +1,3 @@
 operation
 documentTree
-	^ XMLHTMLParser parse: self userNameLink asUrl retrieveContents
\ No newline at end of file
+	^ self documentTreeFor: self userNameLink
\ No newline at end of file
diff --git a/Socialmetrica.package/NitterUser.class/instance/documentTreeFor..st b/Socialmetrica.package/NitterUser.class/instance/documentTreeFor..st
new file mode 100644
index 0000000..d5c7b6c
--- /dev/null
+++ b/Socialmetrica.package/NitterUser.class/instance/documentTreeFor..st
@@ -0,0 +1,5 @@
+accessing
+documentTreeFor: anUrl
+	"Answer the HTML document tree parsed from the contents retrieved at anUrl (anything that understands #asUrl)."
+
+	^ XMLHTMLParser parse: anUrl asUrl retrieveContents
diff --git a/Socialmetrica.package/NitterUser.class/instance/numberOfURLsForLoadingTweets..st b/Socialmetrica.package/NitterUser.class/instance/numberOfURLsForLoadingTweets..st
deleted file mode 100644
index 574cc83..0000000
--- a/Socialmetrica.package/NitterUser.class/instance/numberOfURLsForLoadingTweets..st
+++ /dev/null
@@ -1,37 +0,0 @@
-accessing
-numberOfURLsForLoadingTweets: number
-
-	| collectionURLs count asURLs urlAndTweets |
-	number = 1 ifTrue: [ ^ self ].
-
-	urlAndTweets := OrderedDictionary new.
-	collectionURLs := { self userNameLink } asOrderedCollection.
-
-	urlAndTweets at: 'tweets' put: self lastTweetsFromHtml.
-
-	count := 1.
-	number - count timesRepeat: [
-		| tempDoc docTree urlString |
-		tempDoc := XMLHTMLParser parse:
-			           (collectionURLs at: count) asUrl retrieveContents.
-
-		urlString := self userNameLink
-		             ,
-		             ((tempDoc xPath: '//a[.="Load more"]') @ 'href')
-			             stringValue.
-		docTree := XMLHTMLParser parse: urlString asUrl retrieveContents.
-		collectionURLs add: urlString.
-
-		urlAndTweets
-			at: 'tweets-' , (urlString splitOn: 'cursor=') second
-			put:
-				((docTree xpath: '//div[@class="timeline-item "]')
-					 asOrderedCollection collect: [ :xmlElement |
-					 xmlElement postCopy ]).
-		count := count + 1 ].
-
-	asURLs := collectionURLs collect: [ :string | string asUrl ].
-
-	urlAndTweets at: 'urls' put: asURLs.
-
-	^ urlAndTweets
\ No newline at end of file
diff --git a/Socialmetrica.package/NitterUser.class/instance/pageCursorFor..st b/Socialmetrica.package/NitterUser.class/instance/pageCursorFor..st
new file mode 100644
index 0000000..d649c2f
--- /dev/null
+++ b/Socialmetrica.package/NitterUser.class/instance/pageCursorFor..st
@@ -0,0 +1,12 @@
+accessing
+pageCursorFor: anUrl
+	"Answer a Dictionary with a single entry: the href of the 'Load more' link
+	found in the page at anUrl -> the parsed document tree of that same page."
+
+	| response value key |
+	response := Dictionary new.
+	value := self documentTreeFor: anUrl.
+	key := ((value xpath: '//a[.="Load more"]') @ 'href') stringValue.
+	^ response
+		at: key put: value;
+		yourself
\ No newline at end of file
diff --git a/Socialmetrica.package/NitterUser.class/instance/pageDocTrees..st b/Socialmetrica.package/NitterUser.class/instance/pageDocTrees..st
new file mode 100644
index 0000000..39a1b08
--- /dev/null
+++ b/Socialmetrica.package/NitterUser.class/instance/pageDocTrees..st
@@ -0,0 +1,6 @@
+accessing
+pageDocTrees: anInteger
+	"Answer an OrderedDictionary of page URL -> parsed document tree for the first
+	anInteger timeline pages. Delegates to #pageURLs:, which does the actual paging."
+
+	^ self pageURLs: anInteger
\ No newline at end of file
diff --git a/Socialmetrica.package/NitterUser.class/instance/pageURLs..st b/Socialmetrica.package/NitterUser.class/instance/pageURLs..st
new file mode 100644
index 0000000..462dc38
--- /dev/null
+++ b/Socialmetrica.package/NitterUser.class/instance/pageURLs..st
@@ -0,0 +1,19 @@
+accessing
+pageURLs: anInteger
+	"Answer an OrderedDictionary of page URL (String) -> parsed document tree,
+	starting at the user page and following the 'Load more' link until anInteger
+	pages were read. Each page is downloaded once and stored under its own URL."
+
+	| response pageLink pageTree cursor |
+	response := OrderedDictionary new.
+	pageLink := self userNameLink.
+	pageTree := self documentTreeFor: pageLink.
+	response at: pageLink put: pageTree.
+	anInteger - 1 timesRepeat: [
+		cursor := ((pageTree xpath: '//a[.="Load more"]') @ 'href') stringValue.
+		"No 'Load more' link: the current page is the last one, so stop paging."
+		cursor isEmpty ifTrue: [ ^ response ].
+		pageLink := self userNameLink , cursor.
+		pageTree := self documentTreeFor: pageLink.
+		response at: pageLink put: pageTree ].
+	^ response
\ No newline at end of file
diff --git a/Socialmetrica.package/Tweet.class/instance/metricsFromNitterHtml..st b/Socialmetrica.package/Tweet.class/instance/metricsFromNitterHtml..st
index 92ae422..d4d2ef1 100644
--- a/Socialmetrica.package/Tweet.class/instance/metricsFromNitterHtml..st
+++ b/Socialmetrica.package/Tweet.class/instance/metricsFromNitterHtml..st
@@ -24,7 +24,7 @@ metricsFromNitterHtml: xmlItem
 				 item asString includesSubstring: 'heart' ]) stringValue trimmed
 			 copyReplaceAll: ','
 			 with: '').
-
-	metadata
+	
+	self metadata
 		at: 'pinned'
 		put: (xmlItem xpath: '//div[@class="pinned"]') stringValue trimmed = 'Pinned Tweet'
\ No newline at end of file
diff --git a/Socialmetrica.package/TweetsCollection.class/instance/store.st b/Socialmetrica.package/TweetsCollection.class/instance/store.st
index 6cc1b0f..f6f2a60 100644
--- a/Socialmetrica.package/TweetsCollection.class/instance/store.st
+++ b/Socialmetrica.package/TweetsCollection.class/instance/store.st
@@ -1,4 +1,8 @@
 accessing
 store
 	ReStore isConnected ifFalse: [ self class storeDB]. "Starting the ReStore singleton."
-	self tweets do: [:each | ReStore evaluateAsTransaction: [ each store ] ].
\ No newline at end of file
+	self tweets do: [:each | ReStore evaluateAsTransaction: [
+		each store.
+		"each user id isInDB? ifFalse: [ each user store ]"
+		]
+	].
\ No newline at end of file