Refactoring tweet scraping.

This commit is contained in:
Offray Vladimir Luna Cárdenas 2022-05-15 08:23:21 -05:00
parent 742bb97446
commit a61de2ecb7
7 changed files with 41 additions and 33 deletions

View File

@ -0,0 +1,33 @@
accessing
collectRawTweetsFrom: anUrl upToPage: anInteger
	"Answer a TweetsCollection built from the page contents obtained via
	#getPagesContentsFrom:upTo: for anUrl and anInteger.
	The collection's query records the keys of the pages dictionary and the current timestamp.
	Every '//div[@class=""timeline-item ""]' node of every page is turned into a Tweet whose
	metadata maps a timestamp string to the key of the page it came from
	(presumably the page URL -- confirm against #getPagesContentsFrom:upTo:).
	Finally, every message except the one at the last index stores, in its timelines
	and under my userName, the id of the message that follows it in the collection."

	| pages collection queryInfo |
	pages := self getPagesContentsFrom: anUrl upTo: anInteger.
	collection := TweetsCollection new.
	queryInfo := Dictionary new.
	queryInfo at: 'parameters' put: pages keys.
	queryInfo at: 'date' put: DateAndTime now.
	collection query: queryInfo.
	pages keysAndValuesDo: [ :pageKey :pageContent |
		| items |
		items := (pageContent xpath: '//div[@class="timeline-item "]') asOrderedCollection
			collect: [ :node | node postCopy ].
		items do: [ :item |
			| parsedTweet |
			parsedTweet := Tweet new fromNitterHtmlItem: item.
			"Remember which page this tweet was scraped from, keyed by the scraping time."
			parsedTweet metadata at: DateAndTime now asString put: pageKey.
			collection add: parsedTweet ] ].
	"Drop entries that could not be turned into a tweet."
	collection messages: (collection messages select: [ :each | each isNotNil ]).
	"Link each message to the one stored right after it."
	collection messages doWithIndex: [ :each :index |
		| here following |
		here := collection messages at: index.
		index < collection lastIndex ifTrue: [
			following := collection messages at: index + 1.
			here timelines at: self userName put: following id ] ].
	^ collection

View File

@ -1,33 +1,5 @@
accessing
collectRawTweetsUpToPage: anInteger
| pagesDict response customQuery |
pagesDict := self getPagesContentsUpto: anInteger.
response := TweetsCollection new.
customQuery := Dictionary new
at: 'parameters' put: pagesDict keys;
at: 'date' put: DateAndTime now;
yourself.
response query: customQuery.
pagesDict keysAndValuesDo: [ :key :rawTweets | | temp |
temp := (rawTweets xpath: '//div[@class="timeline-item "]') asOrderedCollection
collect: [ :xmlElement | xmlElement postCopy ].
temp do: [ :tweet | | tempTweet |
tempTweet := Tweet new fromNitterHtmlItem: tweet.
tempTweet metadata
at: DateAndTime now asString put: key;
yourself.
response add: tempTweet.
]
].
response messages: (response messages select: [ :tweet | tweet isNotNil ]).
response messages doWithIndex: [ :tweet :i |
| current previous |
current := response messages at: i.
i < response lastIndex ifTrue: [
previous := response messages at: i + 1.
current timelines
at: self userName put: previous id;
yourself ]].
^ response.
^ self collectRawTweetsFrom: self userNameLinkWithReplies upToPage: anInteger

View File

@ -1,3 +1,3 @@
operation
documentTree
^ self documentTreeFor: (self userNameLink, '/with_replies')
^ self documentTreeFor: self userNameLinkWithReplies

View File

@ -1,5 +1,5 @@
accessing
getPagesContentsFrom: anURL Upto: anInteger
getPagesContentsFrom: anURL upTo: anInteger
"I retroactively get all pages contents until a specified page number.
TO DO: should this be split back into two methods, one getting the page URLs and the other getting their contents?

View File

@ -1,4 +1,4 @@
accessing
getPagesContentsFromOldestUpto: anInteger
^ self getPagesContentsFrom: ((self oldestTweet metadata select: [ :item | item isString and: [ item beginsWith: 'https://' ]]) values first) Upto: anInteger
^ self getPagesContentsFrom: ((self oldestTweet metadata select: [ :item | item isString and: [ item beginsWith: 'https://' ]]) values first) upTo: anInteger

View File

@ -5,4 +5,4 @@ getPagesContentsUpto: anInteger
TO DO: should this be split back into two methods, one getting the page URLs and the other getting their contents?
Or will we always be getting the cursor URLs and their contents together anyway?
[ ] Benchmark alternative approaches."
^ self getPagesContentsFrom: (self userNameLink, '/with_replies') Upto: anInteger
^ self getPagesContentsFrom: self userNameLinkWithReplies upTo: anInteger

View File

@ -0,0 +1,3 @@
operation
userNameLinkWithReplies
	"Answer my userNameLink with the '/with_replies' suffix appended."

	| baseLink |
	baseLink := self userNameLink.
	^ baseLink , '/with_replies'