Refactoring tweets scraping.
This commit is contained in:
parent
742bb97446
commit
a61de2ecb7
@ -0,0 +1,33 @@
|
||||
accessing
|
||||
collectRawTweetsFrom: anUrl upToPage: anInteger
	"Answer a TweetsCollection built from the pages obtained through
	#getPagesContentsFrom:upTo: with anUrl and anInteger.
	The query of the collection records the keys of those pages and the current date and time.
	Every parsed tweet gets a metadata entry mapping a timestamp string to the key of the page
	it was read from (presumably the page URL -- confirm against #getPagesContentsFrom:upTo:).
	After nil messages are filtered out, each message whose index is below the lastIndex of the
	collection stores, in its timelines and under my user name, the id of the message located
	at the following index."

	| pages tweets queryInfo |
	pages := self getPagesContentsFrom: anUrl upTo: anInteger.
	tweets := TweetsCollection new.
	queryInfo := Dictionary new.
	queryInfo at: 'parameters' put: pages keys.
	queryInfo at: 'date' put: DateAndTime now.
	tweets query: queryInfo.
	pages keysAndValuesDo: [ :pageKey :pageContents |
		| items |
		items := (pageContents xpath: '//div[@class="timeline-item "]') asOrderedCollection.
		"NOTE(review): #postCopy is sent directly to each matched element; confirm whether #copy was intended."
		(items collect: [ :item | item postCopy ]) do: [ :timelineItem |
			| parsedTweet |
			parsedTweet := Tweet new fromNitterHtmlItem: timelineItem.
			"The key is the moment of collection, the value is the page the tweet came from."
			parsedTweet metadata at: DateAndTime now asString put: pageKey.
			tweets add: parsedTweet ] ].
	tweets messages: (tweets messages select: [ :each | each isNotNil ]).
	tweets messages doWithIndex: [ :each :index |
		| thisMessage nextInList |
		thisMessage := tweets messages at: index.
		index < tweets lastIndex ifTrue: [
			nextInList := tweets messages at: index + 1.
			thisMessage timelines at: self userName put: nextInList id ] ].
	^ tweets
|
||||
|
@ -1,33 +1,5 @@
|
||||
accessing
|
||||
collectRawTweetsUpToPage: anInteger
|
||||
|
||||
| pagesDict response customQuery |
|
||||
pagesDict := self getPagesContentsUpto: anInteger.
|
||||
response := TweetsCollection new.
|
||||
customQuery := Dictionary new
|
||||
at: 'parameters' put: pagesDict keys;
|
||||
at: 'date' put: DateAndTime now;
|
||||
yourself.
|
||||
response query: customQuery.
|
||||
pagesDict keysAndValuesDo: [ :key :rawTweets | | temp |
|
||||
temp := (rawTweets xpath: '//div[@class="timeline-item "]') asOrderedCollection
|
||||
collect: [ :xmlElement | xmlElement postCopy ].
|
||||
temp do: [ :tweet | | tempTweet |
|
||||
tempTweet := Tweet new fromNitterHtmlItem: tweet.
|
||||
tempTweet metadata
|
||||
at: DateAndTime now asString put: key;
|
||||
yourself.
|
||||
response add: tempTweet.
|
||||
]
|
||||
].
|
||||
response messages: (response messages select: [ :tweet | tweet isNotNil ]).
|
||||
response messages doWithIndex: [ :tweet :i |
|
||||
| current previous |
|
||||
current := response messages at: i.
|
||||
i < response lastIndex ifTrue: [
|
||||
previous := response messages at: i + 1.
|
||||
current timelines
|
||||
at: self userName put: previous id;
|
||||
yourself ]].
|
||||
^ response.
|
||||
^ self collectRawTweetsFrom: self userNameLinkWithReplies upToPage: anInteger
|
||||
|
@ -1,3 +1,3 @@
|
||||
operation
|
||||
documentTree
|
||||
^ self documentTreeFor: (self userNameLink, '/with_replies')
|
||||
^ self documentTreeFor: self userNameLinkWithReplies
|
@ -1,5 +1,5 @@
|
||||
accessing
|
||||
getPagesContentsFrom: anURL Upto: anInteger
|
||||
getPagesContentsFrom: anURL upTo: anInteger
|
||||
"I retroactively get all pages contents until a specified page number.
|
||||
|
||||
TO DO: should this be split back into two methods, one getting the page URLs and the other their contents?
|
@ -1,4 +1,4 @@
|
||||
accessing
|
||||
getPagesContentsFromOldestUpto: anInteger
|
||||
|
||||
^ self getPagesContentsFrom: ((self oldestTweet metadata select: [ :item | item isString and: [ item beginsWith: 'https://' ]]) values first) Upto: anInteger
|
||||
^ self getPagesContentsFrom: ((self oldestTweet metadata select: [ :item | item isString and: [ item beginsWith: 'https://' ]]) values first) upTo: anInteger
|
@ -5,4 +5,4 @@ getPagesContentsUpto: anInteger
|
||||
TO DO: should this be split back into two methods, one getting the page URLs and the other their contents?
|
||||
or should we always be getting the cursor URLs and their contents all the time?
|
||||
[ ] Benchmark alternative approaches."
|
||||
^ self getPagesContentsFrom: (self userNameLink, '/with_replies') Upto: anInteger
|
||||
^ self getPagesContentsFrom: self userNameLinkWithReplies upTo: anInteger
|
@ -0,0 +1,3 @@
|
||||
operation
|
||||
userNameLinkWithReplies
	"Answer my user name link with the '/with_replies' suffix appended."

	| profileLink |
	profileLink := self userNameLink.
	^ profileLink , '/with_replies'
|
Loading…
Reference in New Issue
Block a user