05-31-2022, 02:42 AM
Update #3: I've been playing around with writing my own download routines, rather than relying on curl to do the job for me, and here's what I've come up with so far:
I'm now downloading webpages in large batches (currently set at 20 pages per pass), and as such, I've dropped my download time from 10 minutes down to about 70 seconds. This seems to be a much more efficient method for downloading the whole wiki at once for us!
Code: (Select All)
'Main program: reads the saved "Page List(n).txt" files, then downloads every
'listed wiki page in batches of MaxDownLoads simultaneous TCP/IP connections,
'saving each page as <name>.HTML (and, when DownloadMethod = 2, also
'converting it to <name>.txt).
DefLng A-Z

'One entry per simultaneous download slot.
Type ODL_Type
    site As String 'URL that was requested through this slot
    handle As Long 'client handle from _OpenClient; 0 means the slot is idle
    contents As String 'response text accumulated so far
    baseURL As String 'not assigned anywhere in the visible code
    path As String 'not assigned anywhere in the visible code
End Type

Dim Shared CRLF As String: CRLF$ = Chr$(13) + Chr$(10)
Const DownloadMethod = 2 '1 = HTML download, 2 = IDE text download
Const HomePage$ = "http://qb64phoenix.com"
ReDim Shared PageNames(10000) As String

'NumberOfPageLists = DownloadPageLists 'As of mid 2022, there are only 2 pages listing all the page names.
NumberOfPageLists = 2 'hard coded for counting without having to download pages repeatedly while testing code
PageCount = CountPages(NumberOfPageLists)

Dim Shared MaxDownLoads As Long
MaxDownLoads = 20
Dim Shared ODL(MaxDownLoads) As ODL_Type

t# = Timer: t1$ = Time$

For j = 0 To PageCount Step MaxDownLoads
    'Pass 1: fire off the requests for this batch.
    For i = 1 To MaxDownLoads 'OPEN ALL CONNECTIONS
        If j + i > UBound(PageNames) Then _Continue
        FileName$ = Mid$(PageNames(i + j), _InStrRev(PageNames(i + j), "/") + 1)
        url$ = "http://qb64phoenix.com/qb64wiki/" + FileName$
        If DownloadMethod = 2 Then url$ = "http://qb64phoenix.com/qb64wiki/index.php?title=" + FileName$ + "&action=edit"
        validHandle = OpenDownLoad(url$)
        'Print ODL(i).handle
    Next

    Print "Downloading... (Started at: "; t1$; ")"
    Print "Curently on:"; j; " of "; PageCount

    'Pass 2: poll every open slot until none of them is still active.
    'NOTE(review): the output file is named from PageNames(i + j), which assumes
    'slot i is serving page i + j -- confirm this still holds when a connection
    'in the batch fails to open.
    Do
        finished = -1
        For i = 1 To MaxDownLoads 'DOWNLOAD ALL FILES
            If j + i > UBound(PageNames) Then _Continue
            If ODL(i).handle <> 0 Then
                t$ = GetDownloads(i)
                finished = 0 'at least one slot was still open on this sweep
                If t$ <> "" Then 'a non-empty result means the page is complete
                    FileName$ = Mid$(PageNames(i + j), _InStrRev(PageNames(i + j), "/") + 1)
                    FileName2$ = CleanHTML(FileName$)
                    Select Case FileName2$ 'names that are not usable as file names
                        Case "*": FileName2$ = "Multiply"
                        Case "/": FileName2$ = "Divide"
                    End Select
                    f = FreeFile
                    Open FileName2$ + ".HTML" For Output As #f
                    Print #f, t$
                    Close f
                    If DownloadMethod = 2 Then HTMLtoText FileName2$ + ".HTML", FileName2$ + ".txt"
                End If
            End If
        Next
    Loop Until finished
Next

Print "FINISHED!! (Finsihed at: "; Time$; ")" + Chr$(13)
Print Using "##,###.## seconds to download everything on this PC."; Timer - t#
'Scans the saved "Page List(1).txt" .. "Page List(NumberOfPageLists).txt"
'files for <li><a href="..."> entries and stores each href in the shared
'PageNames array (1-based).
'Returns the number of entries found; PageNames is resized to that count.
Function CountPages (NumberOfPageLists)
    FileLeft$ = "Page List("
    FileRight$ = ").txt"
    For i = 1 To NumberOfPageLists
        file$ = FileLeft$ + _Trim$(Str$(i)) + FileRight$
        Open file$ For Binary As #1
        t$ = Space$(LOF(1)): Get #1, 1, t$
        Close #1
        p = 1 'start every file from its beginning rather than a stale offset
        Do
            '13 = length of <li><a href=" including the opening quote
            p = InStr(p, t$, "<li><a href=") + 13
            If p = 13 Then Exit Do 'we've parsed all the lists from the page. No need to keep going
            p2 = InStr(p, t$, Chr$(34))
            If p2 = 0 Then Exit Do 'unterminated href: nothing usable left in this file
            count = count + 1
            'grow the list rather than overrunning the initial allocation
            If count > UBound(PageNames) Then ReDim _Preserve PageNames(count + 1000) As String
            PageNames(count) = Mid$(t$, p, p2 - p)
        Loop
    Next
    CountPages = count
    ReDim _Preserve PageNames(count) As String
End Function
'Downloads the Special:AllPages index and follows its navigation link,
'saving each listing as "Page List(n).txt".
'Returns the number of listing files written.
Function DownloadPageLists
    FileLeft$ = "Page List("
    FileRight$ = ").txt"
    FileCount = 1
    CurrentFile$ = ""
    url$ = "/qb64wiki/index.php/Special:AllPages" 'the first file that we download
    Do
        file$ = FileLeft$ + _Trim$(Str$(FileCount)) + FileRight$
        Download url$, file$
        url2$ = GetNextPage$(file$)
        P = InStr(url2$, "from=")
        If P = 0 Then Exit Do 'no navigation link with a from= marker: done
        If Mid$(url2$, P + 5) > CurrentFile$ Then
            'the link points further along than the current listing: follow it
            CurrentFile$ = Mid$(url2$, P + 5)
            FileCount = FileCount + 1
            url$ = url2$
        Else
            'the link does not advance past the current listing: stop
            Exit Do
        End If
    Loop
    DownloadPageLists = FileCount
End Function
'Returns a copy of OriginalText$ with HTML entities, a few wiki-markup
'artifacts and %XX escapes (hex 11 through FF) replaced by plain characters.
'Replacements are applied one table entry at a time, in table order, and each
'entry is re-applied until it no longer occurs in the text.
Function CleanHTML$ (OriginalText$)
    text$ = OriginalText$ 'don't corrupt incoming text

    Type ReplaceList
        original As String
        replacement As String
    End Type

    Dim HTML(255) As ReplaceList 'Expandable HTML replacement system
    'The search strings must be the escaped entity names. If an entry maps a
    'string to itself the replacement loop below never terminates.
    HTML(0).original = "&amp;": HTML(0).replacement = "&"
    HTML(1).original = "&lt;": HTML(1).replacement = "<"
    HTML(2).original = "&gt;": HTML(2).replacement = ">"
    HTML(3).original = "&verbar;": HTML(3).replacement = "|" 'NOTE(review): entity spelling reconstructed - confirm
    HTML(4).original = "&pi;": HTML(4).replacement = Chr$(227)
    HTML(5).original = "&theta;": HTML(5).replacement = Chr$(233)
    HTML(6).original = "&sup1;": HTML(6).replacement = Chr$(252)
    HTML(7).original = "&quot;": HTML(7).replacement = Chr$(34)
    HTML(8).original = "&sup2;": HTML(8).replacement = Chr$(253)
    HTML(9).original = "&nbsp;": HTML(9).replacement = Chr$(255) 'NOTE(review): entity spelling reconstructed - confirm
    HTML(10).original = "Start}}'' ''": HTML(10).replacement = "Start}}"
    HTML(11).original = "Start}} '' ''": HTML(11).replacement = "Start}}"
    HTML(12).original = "Start}}" + Chr$(10) + "'' ''": HTML(12).replacement = "Start}}"
    HTML(13).original = "'' ''" + Chr$(10) + "{{": HTML(13).replacement = Chr$(10) + "{{"
    HTML(14).original = "'' '' " + Chr$(10) + "{{": HTML(14).replacement = Chr$(10) + "{{"
    HTML(15).original = "'' ''" + MKI$(&H0A0A) + "{{": HTML(15).replacement = Chr$(10) + "{{"
    HTML(16).original = "#REDIRECT": HTML(16).replacement = "See page"

    'The remaining slots decode URL escapes: "%11" .. "%FF" (upper-case hex).
    For i = 17 To 255
        HTML(i).original = "%" + Hex$(i)
        HTML(i).replacement = Chr$(i)
    Next

    For i = 0 To UBound(HTML)
        Do
            P = InStr(text$, HTML(i).original)
            If P = 0 Then Exit Do
            text$ = Left$(text$, P - 1) + HTML(i).replacement + Mid$(text$, P + Len(HTML(i).original))
        Loop
    Next

    CleanHTML$ = text$
End Function
'Fetches HomePage$ + url$ (after CleanHTML decoding) into outputFile$ by
'shelling out to curl with a hidden console window.
Sub Download (url$, outputFile$)
    quote$ = Chr$(34)
    destination$ = quote$ + outputFile$ + quote$
    target$ = quote$ + HomePage$ + CleanHTML(url$) + quote$
    Shell _Hide "curl -o " + destination$ + " " + target$
End Sub
'Reads the saved listing file currentPage$ and returns the target of the first
'<a href="..."> that follows the "mw-allpages-nav" div, passed through
'CleanHTML. Returns an empty string when the div or the link is not present.
Function GetNextPage$ (currentPage$)
    q$ = Chr$(34)
    navDiv$ = "<div class=" + q$ + "mw-allpages-nav" + q$ + ">"

    'Load the whole file into memory.
    Open currentPage$ For Binary As #1
    page$ = Space$(LOF(1))
    Get #1, 1, page$
    Close #1

    divAt = InStr(page$, navDiv$)
    If divAt = 0 Then Exit Function

    hrefAt = InStr(divAt, page$, "<a href=")
    If hrefAt = 0 Then Exit Function

    linkStart = hrefAt + 9 'skip past <a href= and its opening quote
    linkEnd = InStr(linkStart, page$, q$ + " title")
    GetNextPage$ = CleanHTML(Mid$(page$, linkStart, linkEnd - linkStart))
End Function
'Extracts the region of inFile$ from "<textarea" up to "</textarea>", strips
'markup from it and writes the result to outFile$.
'Text between tags goes through CleanHTML; <br, </p, </l and </d tags emit a
'line break; <script>...</script> sections are dropped.
Sub HTMLtoText (inFile$, outFile$)
    Open inFile$ For Binary As #1
    t$ = Space$(LOF(1)): Get #1, 1, t$
    Close #1
    start$ = "<textarea": t$ = Mid$(t$, InStr(t$, start$))
    finish$ = "</textarea>": t$ = Left$(t$, InStr(t$, finish$) - 1)
    Open outFile$ For Output As #1
    'Consume t$ from the left, one token (tag, whitespace char or text run) per pass.
    Do
        a$ = Left$(t$, 8)
        If a$ = "<script>" Then
            i = InStr(t$, "</script>")
            t$ = Mid$(t$, i + 9) '9 = Len("</script>")
        Else
            a$ = Left$(t$, 1)
            Select Case a$
                Case " ", Chr$(10), Chr$(13): t$ = Mid$(t$, 2) 'ignore leading spaces
                Case "<": 'look for a leading <
                    i = InStr(t$, ">")
                    If i = 0 Then Print #1, CleanHTML(t$): Exit Do 'unclosed tag: dump the rest
                    skip$ = Left$(t$, 3)
                    Select Case skip$
                        Case "<br", "</p", "</l", "</d": Print #1, ""
                    End Select
                    t$ = Mid$(t$, i + 1) 'skip stuff in html formatting brackets
                Case Else
                    i = InStr(t$, "<")
                    If i Then
                        'text run up to the next tag; stay on the same output line
                        Print #1, CleanHTML(Left$(t$, i - 1));
                        t$ = Mid$(t$, i)
                    Else
                        'no more tags: write whatever is left and stop
                        Print #1, CleanHTML(t$)
                        Exit Do
                    End If
            End Select
        End If
    Loop
    Close #1
End Sub
'Starts an HTTP GET for site$ in the first idle slot of the shared ODL array.
'Returns the slot's client handle. Returns 0 when the URL is HTTPS, has no
'path, the connection could not be opened, or no slot is idle.
'If some slot already has site$ recorded, that slot's current handle is
'returned and no new request is sent.
Function OpenDownLoad (site$)
    For i = 1 To MaxDownLoads
        If ODL(i).site = site$ Then
            OpenDownLoad = ODL(i).handle
            Exit Function
        End If
    Next

    For i = 1 To MaxDownLoads
        If ODL(i).handle = 0 Then
            ODL(i).site = site$
            If Left$(UCase$(site$), 5) = "HTTPS" Then Exit Function 'can't open HTTPS pages like this
            webpage$ = site$
            If Left$(LCase$(webpage$), 7) = "http://" Then webpage$ = Mid$(webpage$, 8) 'trim http://
            p = InStr(webpage$, "/")
            If p = 0 Then Exit Function
            'base is everything before the first /, path is everything else.
            'for example: qb64phoenix.com/qb64wiki/index.php=Main_Page, our base is qb64phoenix.com
            '             and the path would be /qb64wiki/index.php=Main_Page
            baseURL$ = Left$(webpage$, p - 1)
            path$ = Mid$(webpage$, p)
            ODL(i).handle = _OpenClient("TCP/IP:80:" + baseURL$)
            ODL(i).contents = ""
            If ODL(i).handle = 0 Then Exit Function 'connection failed: nothing to send the request on
            Request$ = "GET " + path$ + " HTTP/1.1" + CRLF$ + "Host:" + baseURL$ + CRLF$ + CRLF$
            Put #ODL(i).handle, , Request$
            OpenDownLoad = ODL(i).handle 'report the live handle to the caller
            Exit Function
        End If
    Next

    OpenDownLoad = 0
End Function
'Reads whatever data is currently available on download slot i and appends it
'to ODL(i).contents.
'Returns the complete response once "</html>" has arrived (the connection is
'then closed and the slot's handle reset to 0); returns "" while the page is
'still incomplete or when the slot is idle.
Function GetDownloads$ (i)
    ' Print i, ODL(i).handle
    If ODL(i).handle = 0 Then Exit Function

    Get #ODL(i).handle, , t$

    '"</html>" is 7 characters, so up to 6 of them may already sit at the end
    'of the earlier data. Begin the search there so a terminator split across
    'two reads is still detected.
    scanFrom = Len(ODL(i).contents) - 5
    If scanFrom < 1 Then scanFrom = 1

    ODL(i).contents = ODL(i).contents + t$
    If InStr(scanFrom, ODL(i).contents, "</html>") Then
        Close ODL(i).handle
        ODL(i).handle = 0
        GetDownloads$ = ODL(i).contents
    End If
End Function
'Blocking single-page download: sends an HTTP GET for toSite$ over a TCP/IP
'client connection and returns everything received up to and including the
'read that completes "</html>".
'Returns "" for HTTPS URLs, URLs without a path, or a failed connection.
'NOTE(review): this Function shares its name with Sub Download - confirm the
'compiler accepts both in one program.
Function Download$ (toSite$)
    CRLF$ = Chr$(13) + Chr$(10)
    If Left$(UCase$(toSite$), 5) = "HTTPS" Then Exit Function 'can't open HTTPS pages like this
    webpage$ = toSite$
    If Left$(LCase$(webpage$), 7) = "http://" Then webpage$ = Mid$(webpage$, 8) 'trim http://
    p = InStr(webpage$, "/")
    If p = 0 Then Exit Function
    'base is everything before the first /, path is everything else.
    'for example: qb64phoenix.com/qb64wiki/index.php=Main_Page, our base is qb64phoenix.com
    '             and the path would be /qb64wiki/index.php=Main_Page
    baseURL$ = Left$(webpage$, p - 1)
    path$ = Mid$(webpage$, p)
    OpenHandle = _OpenClient("TCP/IP:80:" + baseURL$)
    If OpenHandle = 0 Then Exit Function 'connection failed

    Request$ = "GET " + path$ + " HTTP/1.1" + CRLF$ + "Host:" + baseURL$ + CRLF$ + CRLF$
    Put #OpenHandle, , Request$

    Do
        Get #OpenHandle, , t$
        'search from just before the new data so a "</html>" split across two
        'reads is still found (7 characters, so at most 6 arrived earlier)
        scanFrom = Len(tempDownload$) - 5
        If scanFrom < 1 Then scanFrom = 1
        tempDownload$ = tempDownload$ + t$
        _Limit 20
    Loop Until InStr(scanFrom, tempDownload$, "</html>")

    Close OpenHandle
    Download$ = tempDownload$
End Function
