Wiki Downloader - SMcNeill - 05-24-2022
Lots of folks are always asking how to download the wiki for off-line use, and over the years we've used tools such as HTTrack to mirror the website for a local copy. The problem with this is that the archive we get is often huge as BLEEP! The last wiki I downloaded and shared was an archive of over 1.1GB in size!
So, I wanted a much simpler alternative, and thus I came up with this little solution:
Code: (Select All) $Console:Only
DefLng A-Z
Const HomePage$ = "https://qb64phoenix.com"
ReDim Shared PageNames(10000) As String
NumberOfPageLists = DownloadPageLists 'As of mid 2022, there are only 2 pages listing all the page names.
'NumberOfPageLists = 2 'hard coded for counting without having to download pages repeatedly while testing code
PageCount = CountPages(NumberOfPageLists)
Print PageCount
t# = Timer: t$ = Time$
For i = 1 To PageCount
Cls
Print "Downloading... (Started at: "; t$; ")"
FileName$ = Mid$(PageNames(i), _InStrRev(PageNames(i), "/") + 1) + ".HTML"
FileName$ = CleanHTML(FileName$)
Print i; "of"; PageCount, FileName$
Download PageNames(i), FileName$
_Display
Next
_AutoDisplay
Print "FINISHED!! (Finsihed at: "; Time$; ")"
Print
Print Using "##,###.## seconds to download everything on this PC."; Timer - t#
Function CountPages (NumberOfPageLists)
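'Read each saved "Page List(n).txt", pull every page link from its <li><a href= entries into PageNames(), and return the total page count.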
FileLeft$ = "Page List("
FileRight$ = ").txt"
For i = 1 To NumberOfPageLists
file$ = FileLeft$ + _Trim$(Str$(i)) + FileRight$
Open file$ For Binary As #1
l = LOF(1): t$ = Space$(l)
Get #1, 1, t$
Close #1
start = InStr(t$, "<ul") 'skip down to the part of the page with the page listins
finish = InStr(start, t$, "</ul") 'and we can quit parsing when we get down to this point
p = start 'current position in file we're parsing
Do Until p > finish
p = InStr(p, t$, "<li><a href=") + 13
If p = 13 Then Exit Do 'we've parsed all the lists from the page. No need to keep going
p2 = InStr(p, t$, Chr$(34))
count = count + 1
PageNames(count) = Mid$(t$, p, p2 - p)
Loop
Next
CountPages = count
ReDim _Preserve PageNames(count) As String
End Function
Function DownloadPageLists
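'Fetch Special:AllPages and follow its "next page" link, saving each listing as "Page List(n).txt"; returns how many listing files were saved.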
FileLeft$ = "Page List("
FileRight$ = ").txt"
FileCount = 1
CurrentFile$ = ""
url$ = "/qb64wiki/index.php/Special:AllPages" 'the first file that we download
Do
file$ = FileLeft$ + _Trim$(Str$(FileCount)) + FileRight$
Download url$, file$
url2$ = GetNextPage$(file$)
P = InStr(url2$, "from=")
If P = 0 Then Exit Do
If Mid$(url2$, P + 5) > CurrentFile$ Then
CurrentFile$ = Mid$(url2$, P + 5)
FileCount = FileCount + 1
url$ = url2$
Else
Exit Do
End If
Loop
DownloadPageLists = FileCount
End Function
Function CleanHTML$ (OriginalText$)
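'Decode the ampersand entity and percent-escaped characters in URLs back to plain text.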
text$ = OriginalText$ 'don't corrupt incoming text
Type ReplaceList
original As String
replacement As String
End Type
'Expandable HTML replacement system
Dim HTML(200) As ReplaceList
HTML(0).original = "&amp;": HTML(0).replacement = "&"
' HTML(1).original = "%24": HTML(1).replacement = "$"
For i = 1 To 200
HTML(i).original = "%" + Hex$(i + 16)
HTML(i).replacement = Chr$(i + 16)
'Print HTML(i).original, HTML(i).replacement
'Sleep
Next
For i = 0 To UBound(HTML)
Do
P = InStr(text$, HTML(i).original)
If P = 0 Then Exit Do
text$ = Left$(text$, P - 1) + HTML(i).replacement + Mid$(text$, P + Len(HTML(i).original))
Loop
Next
CleanHTML$ = text$
End Function
Sub Download (url$, outputFile$)
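'Fetch HomePage$ + url$ with curl and write it to outputFile$ (curl must be available on the system path).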
url2$ = CleanHTML(url$)
'Print "https://qb64phoenix.com/qb64wiki/index.php?title=Special:AllPages&from=KEY+n"
'Print HomePage$ + url2$
Shell _Hide "curl -o " + Chr$(34) + outputFile$ + Chr$(34) + " " + Chr$(34) + HomePage$ + url2$ + Chr$(34)
End Sub
Function GetNextPage$ (currentPage$)
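'Scan a saved listing file for the mw-allpages-nav block and return the link to the next page of listings, if any.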
SpecialPageDivClass$ = "<div class=" + Chr$(34) + "mw-allpages-nav" + Chr$(34) + ">"
SpecialPageLink$ = "<a href="
SpecialPageEndLink$ = Chr$(34) + " title"
Open currentPage$ For Binary As #1
l = LOF(1)
t$ = Space$(l)
Get #1, 1, t$
Close
sp = InStr(t$, SpecialPageDivClass$)
If sp Then
lp = InStr(sp, t$, SpecialPageLink$)
If lp Then
lp = lp + 9
lp2 = InStr(lp, t$, SpecialPageEndLink$)
link$ = Mid$(t$, lp, lp2 - lp)
GetNextPage$ = CleanHTML(link$)
End If
End If
End Function
With all of 130 lines of code, we fetch and save the whole wiki to disk!
NOTE: If you run this, it will save over 600 files into the same directory the program is run from! I'd suggest saving it into its own directory and running it from there! Complaining over a messy QB64 folder after this warning will only get me to laugh at you.
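If you'd rather let the program handle that housekeeping itself, here's a minimal sketch you could drop in at the top of the program, assuming a folder name of "wiki_pages" (my own illustrative name, not part of the program above):
Code: (Select All) If Not _DirExists("wiki_pages") Then MkDir "wiki_pages" 'make a dedicated folder for the downloads ("wiki_pages" is just an example name)
ChDir "wiki_pages" 'and work from inside it, so the .HTML files land there instead of the program's folder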
Now, this creates a whole bunch of files with a *.HTML extension. You can click on any of these files and open them in your web browser, but you should know in advance that the links between them aren't going to work, as they expect the wiki's file structure and we're not providing that here. You'll have to open each file individually yourself.
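If you do want the cross-links to work offline, one option is to post-process each saved page and point the wiki hrefs at the local .HTML names instead. Here's a minimal sketch of that idea; FixLinks$ and the link prefix it assumes are my own illustration, not part of the program above:
Code: (Select All) Function FixLinks$ (page$)
'Rewrite href="/qb64wiki/index.php/PageName" into href="PageName.HTML" so the
'saved pages link to each other on disk. Illustrative only: it assumes every
'internal link uses this exact prefix, which may not hold for every page.
t$ = page$
prefix$ = "href=" + Chr$(34) + "/qb64wiki/index.php/"
Do
p = InStr(t$, prefix$)
If p = 0 Then Exit Do
q = InStr(p + Len(prefix$), t$, Chr$(34)) 'closing quote of the href
If q = 0 Then Exit Do 'malformed link; stop rather than guess
pg$ = Mid$(t$, p + Len(prefix$), q - p - Len(prefix$))
t$ = Left$(t$, p - 1) + "href=" + Chr$(34) + pg$ + ".HTML" + Mid$(t$, q)
Loop
FixLinks$ = t$
End Function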
Working with these downloaded files takes a little more effort, but there are a couple of large advantages as well:
1) Anytime you run the program, you'll know you have the most up-to-date information from the wiki.
2) You don't have to wait for someone to run HTTrack, grab a copy from the web, and share it with you.
3) The total size of this on disk is about 20MB -- not 1.2GB!! It's a helluva lot more portable!
Pages aren't quite as pretty as what you'll find on the wiki itself, since the local copies don't have the templates to pull on to format everything properly, but they're more than readable in my opinion. Below is how the scancode page looks for me in Microsoft Edge, as an example page.
For anyone who just wants the files themselves, without downloading the pages on their own (it takes about 10 minutes on my machine to fetch them all), I've zipped them up in a 7z archive, which is available via the attachment below.
RE: Wiki Downloader - Statsman1 - 05-24-2022
That is awesome, Steve, thanks for this!
RE: Wiki Downloader - Coolman - 05-24-2022
Very interesting and useful. Thanks for sharing this code.
RE: Wiki Downloader - SMcNeill - 05-25-2022
An upgraded version 2.0 of this -- it now has the option to download readable HTML pages like the first version produced, or you can toggle the flag and get basically the same text that's used for the IDE help system and stored in internal/help.
Code: (Select All) $Console:Only
DefLng A-Z
Const DownloadMethod = 2 '1 = HTML download, 2 = IDE text download
Const HomePage$ = "https://qb64phoenix.com"
ReDim Shared PageNames(10000) As String
NumberOfPageLists = DownloadPageLists 'As of mid 2022, there are only 2 pages listing all the page names.
'NumberOfPageLists = 2 'hard coded for counting without having to download pages repeatedly while testing code
PageCount = CountPages(NumberOfPageLists)
Print PageCount
t# = Timer: t$ = Time$
For i = 1 To PageCount
Cls
Print "Downloading... (Started at: "; t$; ")"
FileName$ = Mid$(PageNames(i), _InStrRev(PageNames(i), "/") + 1)
FileName2$ = CleanHTML(FileName$)
Select Case FileName2$
Case "*": FileName2$ = "Multiply"
Case "/": FileName2$ = "Divide"
End Select
Print i; "of"; PageCount, FileName2$: _Display
If DownloadMethod = 2 Then 'download as IDE text file
url$ = "/qb64wiki/index.php?title=" + FileName$ + "&action=edit"
Download url$, FileName2$ + ".HTML"
HTMLtoText FileName2$ + ".HTML", FileName2$ + ".txt"
'Kill FileName2$ + ".HTML" 'I'll leave this for now, for comparison and reference. Remember to unremark it later.
Else 'download as HTML file for browser viewing
Download PageNames(i), FileName2$ + ".HTML"
End If
Next
_AutoDisplay
Print "FINISHED!! (Finsihed at: "; Time$; ")" + Chr$(13)
Print Using "##,###.## seconds to download everything on this PC."; Timer - t#
Function CountPages (NumberOfPageLists)
FileLeft$ = "Page List("
FileRight$ = ").txt"
For i = 1 To NumberOfPageLists
file$ = FileLeft$ + _Trim$(Str$(i)) + FileRight$
Open file$ For Binary As #1
t$ = Space$(LOF(1)): Get #1, 1, t$
Close #1
Do
p = InStr(p, t$, "<li><a href=") + 13
If p = 13 Then Exit Do 'we've parsed all the lists from the page. No need to keep going
p2 = InStr(p, t$, Chr$(34))
count = count + 1
PageNames(count) = Mid$(t$, p, p2 - p)
Loop
Next
CountPages = count
ReDim _Preserve PageNames(count) As String
End Function
Function DownloadPageLists
FileLeft$ = "Page List("
FileRight$ = ").txt"
FileCount = 1
CurrentFile$ = ""
url$ = "/qb64wiki/index.php/Special:AllPages" 'the first file that we download
Do
file$ = FileLeft$ + _Trim$(Str$(FileCount)) + FileRight$
Download url$, file$
url2$ = GetNextPage$(file$)
P = InStr(url2$, "from=")
If P = 0 Then Exit Do
If Mid$(url2$, P + 5) > CurrentFile$ Then
CurrentFile$ = Mid$(url2$, P + 5)
FileCount = FileCount + 1
url$ = url2$
Else
Exit Do
End If
Loop
DownloadPageLists = FileCount
End Function
Function CleanHTML$ (OriginalText$)
text$ = OriginalText$ 'don't corrupt incoming text
Type ReplaceList
original As String
replacement As String
End Type
Dim HTML(255) As ReplaceList 'Expandable HTML replacement system
HTML(0).original = "&amp;": HTML(0).replacement = "&"
HTML(1).original = "&lt;": HTML(1).replacement = "<"
HTML(2).original = "&gt;": HTML(2).replacement = ">"
HTML(3).original = "&#124;": HTML(3).replacement = "|"
HTML(4).original = "&pi;": HTML(4).replacement = Chr$(227)
HTML(5).original = "&theta;": HTML(5).replacement = Chr$(233)
HTML(6).original = "&sup1;": HTML(6).replacement = Chr$(252)
HTML(7).original = "&quot;": HTML(7).replacement = Chr$(34)
HTML(8).original = "&sup2;": HTML(8).replacement = Chr$(253)
HTML(9).original = "&nbsp;": HTML(9).replacement = Chr$(255)
HTML(10).original = "Start}}'' ''": HTML(10).replacement = "Start}}"
HTML(11).original = "Start}} '' ''": HTML(11).replacement = "Start}}"
HTML(12).original = "Start}}" + Chr$(10) + "'' ''": HTML(12).replacement = "Start}}"
HTML(13).original = "'' ''" + Chr$(10) + "{{": HTML(13).replacement = Chr$(10) + "{{"
HTML(14).original = "'' '' " + Chr$(10) + "{{": HTML(14).replacement = Chr$(10) + "{{"
HTML(15).original = "'' ''" + MKI$(&H0A0A) + "{{": HTML(15).replacement = Chr$(10) + "{{"
HTML(16).original = "#REDIRECT": HTML(16).replacement = "See page"
For i = 17 To 255
HTML(i).original = "%" + Hex$(i)
HTML(i).replacement = Chr$(i)
Next
For i = 0 To UBound(HTML)
Do
P = InStr(text$, HTML(i).original)
If P = 0 Then Exit Do
text$ = Left$(text$, P - 1) + HTML(i).replacement + Mid$(text$, P + Len(HTML(i).original))
Loop
Next
CleanHTML$ = text$
End Function
Sub Download (url$, outputFile$)
url2$ = CleanHTML(url$)
Shell _Hide "curl -o " + Chr$(34) + outputFile$ + Chr$(34) + " " + Chr$(34) + HomePage$ + url2$ + Chr$(34)
End Sub
Function GetNextPage$ (currentPage$)
SpecialPageDivClass$ = "<div class=" + Chr$(34) + "mw-allpages-nav" + Chr$(34) + ">"
SpecialPageLink$ = "<a href="
SpecialPageEndLink$ = Chr$(34) + " title"
Open currentPage$ For Binary As #1
l = LOF(1)
t$ = Space$(l)
Get #1, 1, t$
Close
sp = InStr(t$, SpecialPageDivClass$)
If sp Then
lp = InStr(sp, t$, SpecialPageLink$)
If lp Then
lp = lp + 9
lp2 = InStr(lp, t$, SpecialPageEndLink$)
link$ = Mid$(t$, lp, lp2 - lp)
GetNextPage$ = CleanHTML(link$)
End If
End If
End Function
Sub HTMLtoText (inFile$, outFile$)
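'Pull the raw wiki source out of the edit page's <textarea>, strip the remaining HTML tags, and write the result as plain text.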
Open inFile$ For Binary As #1
t$ = Space$(LOF(1)): Get #1, 1, t$
Close
start$ = "<textarea": t$ = Mid$(t$, InStr(t$, start$))
finish$ = "</textarea>": t$ = Left$(t$, InStr(t$, finish$) - 1)
Open outFile$ For Output As #1
Do
a$ = Left$(t$, 8)
If a$ = "<script>" Then
i = InStr(t$, "</script>")
t$ = Mid$(t$, i + 9)
Else
a$ = Left$(t$, 1)
Select Case a$
Case " ", Chr$(10), Chr$(13): t$ = Mid$(t$, 2) 'ignore leading spaces
Case "<": 'look for a leading <
i = InStr(t$, ">")
If i = 0 Then Print #1, CleanHTML(t$): Exit Do
skip$ = Left$(t$, 3)
Select Case skip$
Case "<br", "</p", "</l", "</d": Print #1, ""
End Select
t$ = Mid$(t$, i + 1) 'skip stuff in html formatting brackets
Case Else
i = InStr(t$, "<")
If i Then
Print #1, CleanHTML(Left$(t$, i - 1));
t$ = Mid$(t$, i)
Else
Print #1, CleanHTML(t$)
Exit Do
End If
End Select
End If
Loop
Close
End Sub
I may have overlooked a few small tweaks needed for this to *perfectly* match the IDE format, but I don't think those will be very hard to sort out and adjust for with the source as it stands now.
Below is an image of what an IDE-formatted help page looks like, for those interested:
RE: Wiki Downloader - zaadstra - 05-27-2022
I was just curious and HTTracked the Wiki.
It gave me a folder of 66 MB with 2084 files. Most importantly, all the links are fully working.
But if you refresh the Help in QB64, you would get the same recent pages, wouldn't you? Also offline.
Side note: how can we get Google to find the new wiki? I usually search "QB64 <mycommand>" and this still gets the old (non-working) site.
RE: Wiki Downloader - SMcNeill - 05-31-2022
Update #3: I've been playing around with writing my own download routines, rather than relying on curl to do the job for me, and here's what I've come up with so far:
Code: (Select All) $Console:Only
DefLng A-Z
Type ODL_Type
site As String
handle As Long
contents As String
baseURL As String
path As String
End Type
Dim Shared CRLF As String: CRLF$ = Chr$(13) + Chr$(10)
Const DownloadMethod = 2 '1 = HTML download, 2 = IDE text download
Const HomePage$ = "http://qb64phoenix.com"
ReDim Shared PageNames(10000) As String
'NumberOfPageLists = DownloadPageLists 'As of mid 2022, there are only 2 pages listing all the page names.
NumberOfPageLists = 2 'hard coded for counting without having to download pages repeatedly while testing code
PageCount = CountPages(NumberOfPageLists)
Dim Shared MaxDownLoads As Long: MaxDownLoads = 20
Dim Shared ODL(MaxDownLoads) As ODL_Type
t# = Timer: t1$ = Time$
For j = 0 To PageCount Step MaxDownLoads
For i = 1 To MaxDownLoads 'OPEN ALL CONNECTIONS
If j + i > UBound(PageNames) Then _Continue
FileName$ = Mid$(PageNames(i + j), _InStrRev(PageNames(i + j), "/") + 1)
url$ = "http://qb64phoenix.com/qb64wiki/" + FileName$
If DownloadMethod = 2 Then url$ = "http://qb64phoenix.com/qb64wiki/index.php?title=" + FileName$ + "&action=edit"
validHandle = OpenDownLoad(url$)
'Print ODL(i).handle
Next
Cls
Print "Downloading... (Started at: "; t1$; ")"
Print "Curently on:"; j; " of "; PageCount
Do
finished = -1
For i = 1 To MaxDownLoads 'DOWNLOAD ALL FILES
If j + i > UBound(PageNames) Then _Continue
If ODL(i).handle <> 0 Then
t$ = GetDownloads(i)
finished = 0
If t$ <> "" Then
FileName$ = Mid$(PageNames(i + j), _InStrRev(PageNames(i + j), "/") + 1)
FileName2$ = CleanHTML(FileName$)
Select Case FileName2$
Case "*": FileName2$ = "Multiply"
Case "/": FileName2$ = "Divide"
End Select
f = FreeFile
Open FileName2$ + ".HTML" For Output As #f
Print #f, t$
Close f
If DownloadMethod = 2 Then HTMLtoText FileName2$ + ".HTML", FileName2$ + ".txt"
End If
End If
Next
Loop Until finished
Next
Print "FINISHED!! (Finsihed at: "; Time$; ")" + Chr$(13)
Print Using "##,###.## seconds to download everything on this PC."; Timer - t#
Function CountPages (NumberOfPageLists)
FileLeft$ = "Page List("
FileRight$ = ").txt"
For i = 1 To NumberOfPageLists
file$ = FileLeft$ + _Trim$(Str$(i)) + FileRight$
Open file$ For Binary As #1
t$ = Space$(LOF(1)): Get #1, 1, t$
Close #1
Do
p = InStr(p, t$, "<li><a href=") + 13
If p = 13 Then Exit Do 'we've parsed all the lists from the page. No need to keep going
p2 = InStr(p, t$, Chr$(34))
count = count + 1
PageNames(count) = Mid$(t$, p, p2 - p)
Loop
Next
CountPages = count
ReDim _Preserve PageNames(count) As String
End Function
Function DownloadPageLists
FileLeft$ = "Page List("
FileRight$ = ").txt"
FileCount = 1
CurrentFile$ = ""
url$ = "/qb64wiki/index.php/Special:AllPages" 'the first file that we download
Do
file$ = FileLeft$ + _Trim$(Str$(FileCount)) + FileRight$
Download url$, file$
url2$ = GetNextPage$(file$)
P = InStr(url2$, "from=")
If P = 0 Then Exit Do
If Mid$(url2$, P + 5) > CurrentFile$ Then
CurrentFile$ = Mid$(url2$, P + 5)
FileCount = FileCount + 1
url$ = url2$
Else
Exit Do
End If
Loop
DownloadPageLists = FileCount
End Function
Function CleanHTML$ (OriginalText$)
text$ = OriginalText$ 'don't corrupt incoming text
Type ReplaceList
original As String
replacement As String
End Type
Dim HTML(255) As ReplaceList 'Expandable HTML replacement system
HTML(0).original = "&amp;": HTML(0).replacement = "&"
HTML(1).original = "&lt;": HTML(1).replacement = "<"
HTML(2).original = "&gt;": HTML(2).replacement = ">"
HTML(3).original = "&#124;": HTML(3).replacement = "|"
HTML(4).original = "&pi;": HTML(4).replacement = Chr$(227)
HTML(5).original = "&theta;": HTML(5).replacement = Chr$(233)
HTML(6).original = "&sup1;": HTML(6).replacement = Chr$(252)
HTML(7).original = "&quot;": HTML(7).replacement = Chr$(34)
HTML(8).original = "&sup2;": HTML(8).replacement = Chr$(253)
HTML(9).original = "&nbsp;": HTML(9).replacement = Chr$(255)
HTML(10).original = "Start}}'' ''": HTML(10).replacement = "Start}}"
HTML(11).original = "Start}} '' ''": HTML(11).replacement = "Start}}"
HTML(12).original = "Start}}" + Chr$(10) + "'' ''": HTML(12).replacement = "Start}}"
HTML(13).original = "'' ''" + Chr$(10) + "{{": HTML(13).replacement = Chr$(10) + "{{"
HTML(14).original = "'' '' " + Chr$(10) + "{{": HTML(14).replacement = Chr$(10) + "{{"
HTML(15).original = "'' ''" + MKI$(&H0A0A) + "{{": HTML(15).replacement = Chr$(10) + "{{"
HTML(16).original = "#REDIRECT": HTML(16).replacement = "See page"
For i = 17 To 255
HTML(i).original = "%" + Hex$(i)
HTML(i).replacement = Chr$(i)
Next
For i = 0 To UBound(HTML)
Do
P = InStr(text$, HTML(i).original)
If P = 0 Then Exit Do
text$ = Left$(text$, P - 1) + HTML(i).replacement + Mid$(text$, P + Len(HTML(i).original))
Loop
Next
CleanHTML$ = text$
End Function
Sub Download (url$, outputFile$)
url2$ = CleanHTML(url$)
Shell _Hide "curl -o " + Chr$(34) + outputFile$ + Chr$(34) + " " + Chr$(34) + HomePage$ + url2$ + Chr$(34)
End Sub
Function GetNextPage$ (currentPage$)
SpecialPageDivClass$ = "<div class=" + Chr$(34) + "mw-allpages-nav" + Chr$(34) + ">"
SpecialPageLink$ = "<a href="
SpecialPageEndLink$ = Chr$(34) + " title"
Open currentPage$ For Binary As #1
l = LOF(1)
t$ = Space$(l)
Get #1, 1, t$
Close #1
sp = InStr(t$, SpecialPageDivClass$)
If sp Then
lp = InStr(sp, t$, SpecialPageLink$)
If lp Then
lp = lp + 9
lp2 = InStr(lp, t$, SpecialPageEndLink$)
link$ = Mid$(t$, lp, lp2 - lp)
GetNextPage$ = CleanHTML(link$)
End If
End If
End Function
Sub HTMLtoText (inFile$, outFile$)
Open inFile$ For Binary As #1
t$ = Space$(LOF(1)): Get #1, 1, t$
Close #1
start$ = "<textarea": t$ = Mid$(t$, InStr(t$, start$))
finish$ = "</textarea>": t$ = Left$(t$, InStr(t$, finish$) - 1)
Open outFile$ For Output As #1
Do
a$ = Left$(t$, 8)
If a$ = "<script>" Then
i = InStr(t$, "</script>")
t$ = Mid$(t$, i + 9)
Else
a$ = Left$(t$, 1)
Select Case a$
Case " ", Chr$(10), Chr$(13): t$ = Mid$(t$, 2) 'ignore leading spaces
Case "<": 'look for a leading <
i = InStr(t$, ">")
If i = 0 Then Print #1, CleanHTML(t$): Exit Do
skip$ = Left$(t$, 3)
Select Case skip$
Case "<br", "</p", "</l", "</d": Print #1, ""
End Select
t$ = Mid$(t$, i + 1) 'skip stuff in html formatting brackets
Case Else
i = InStr(t$, "<")
If i Then
Print #1, CleanHTML(Left$(t$, i - 1));
t$ = Mid$(t$, i)
Else
Print #1, CleanHTML(t$)
Exit Do
End If
End Select
End If
Loop
Close #1
End Sub
Function OpenDownLoad (site$)
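'Reuse the open handle if this site is already being fetched; otherwise take a free slot, open a TCP/IP client on port 80, and send the GET request.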
For i = 1 To MaxDownLoads
If ODL(i).site = site$ Then
OpenDownLoad = ODL(i).handle
Exit Function
End If
Next
For i = 1 To MaxDownLoads
If ODL(i).handle = 0 Then
ODL(i).site = site$
If Left$(UCase$(site$), 5) = "HTTPS" Then Exit Function 'can't open HTTPS pages like this
webpage$ = site$
If Left$(LCase$(webpage$), 7) = "http://" Then webpage$ = Mid$(webpage$, 8) 'trim http://
p = InStr(webpage$, "/")
If p = 0 Then Exit Function
baseURL$ = Left$(webpage$, p - 1)
path$ = Mid$(webpage$, p)
ODL(i).handle = _OpenClient("TCP/IP:80:" + baseURL$)
ODL(i).contents = ""
'base is everything before the first /, path is everything else.
'for example: qb64phoenix.com/qb64wiki/index.php=Main_Page, our base is qb64phoenix.com
' and the path would be /qb64wiki/index.php=Main_Page
Request$ = "GET " + path$ + " HTTP/1.1" + CRLF$ + "Host:" + baseURL$ + CRLF$ + CRLF$
Put #ODL(i).handle, , Request$
Exit Function
End If
Next
OpenDownLoad = 0
End Function
Function GetDownloads$ (i)
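'Poll one connection, buffering whatever has arrived; once "</html>" shows up, close the handle and return the complete page.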
' Print i, ODL(i).handle
If ODL(i).handle <> 0 Then
Get #ODL(i).handle, , t$
ODL(i).contents = ODL(i).contents + t$
If InStr(t$, "</html>") Then
Close ODL(i).handle
ODL(i).handle = 0
GetDownloads$ = ODL(i).contents
End If
End If
End Function
Function Download$ (toSite$)
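'Blocking single-page download over a raw TCP client -- the one-at-a-time counterpart to the batched routines above.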
CRLF$ = Chr$(13) + Chr$(10)
If Left$(UCase$(toSite$), 5) = "HTTPS" Then Exit Function 'can't open HTTPS pages like this
webpage$ = toSite$
If Left$(LCase$(webpage$), 7) = "http://" Then webpage$ = Mid$(webpage$, 8) 'trim http://
p = InStr(webpage$, "/")
If p = 0 Then Exit Function
baseURL$ = Left$(webpage$, p - 1)
path$ = Mid$(webpage$, p)
OpenHandle = _OpenClient("TCP/IP:80:" + baseURL$)
'base is everything before the first /, path is everything else.
'for example: qb64phoenix.com/qb64wiki/index.php=Main_Page, our base is qb64phoenix.com
' and the path would be /qb64wiki/index.php=Main_Page
Request$ = "GET " + path$ + " HTTP/1.1" + CRLF$ + "Host:" + baseURL$ + CRLF$ + CRLF$
Put #OpenHandle, , Request$
Do
Get #OpenHandle, , t$
tempDownload$ = tempDownload$ + t$
_Limit 20
Loop Until InStr(t$, "</html>")
Close OpenHandle
Download$ = tempDownload$
End Function
I'm now downloading webpages in large batches (currently set at 20 pages per pass), and as a result I've dropped my download time from 10 minutes to about 70 seconds. This seems to be a much more efficient method for downloading the whole wiki at once!
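One caveat: GetDownloads$ treats a page as finished once "</html>" shows up in the stream, which works for complete wiki pages but isn't a general end-of-response test. A more robust alternative is to honor the HTTP Content-Length header; below is a minimal sketch (ResponseComplete is my own illustration, and it assumes the server sends a Content-Length header rather than chunked encoding):
Code: (Select All) Function ResponseComplete (buffer$)
'Return true once the body received matches the Content-Length header.
'Illustrative only: assumes a Content-Length header is present and the
'response is not chunk-encoded.
headerEnd = InStr(buffer$, Chr$(13) + Chr$(10) + Chr$(13) + Chr$(10))
If headerEnd = 0 Then Exit Function 'headers not fully received yet
cl = InStr(UCase$(Left$(buffer$, headerEnd)), "CONTENT-LENGTH:")
If cl = 0 Then Exit Function 'no Content-Length; fall back to the </html> test
bodyLength = Val(Mid$(buffer$, cl + 15))
If Len(buffer$) - (headerEnd + 3) >= bodyLength Then ResponseComplete = -1
End Function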
RE: Wiki Downloader - SMcNeill - 05-31-2022
And, after a little additional fiddling, I'm now generating pages that 100% match the QB64 IDE help formatting.
Code: (Select All) $Console:Only
DefLng A-Z
Type ODL_Type
site As String
handle As Long
contents As String
baseURL As String
path As String
End Type
Dim Shared CRLF As String: CRLF$ = Chr$(13) + Chr$(10)
Const DownloadMethod = 2 '1 = HTML download, 2 = IDE text download
Const HomePage$ = "http://qb64phoenix.com"
ReDim Shared PageNames(10000) As String
NumberOfPageLists = DownloadPageLists 'As of mid 2022, there are only 2 pages listing all the page names.
'NumberOfPageLists = 2 'hard coded for counting without having to download pages repeatedly while testing code
PageCount = CountPages(NumberOfPageLists)
Dim Shared MaxDownLoads As Long: MaxDownLoads = 20
Dim Shared ODL(MaxDownLoads) As ODL_Type
t# = Timer: t1$ = Time$
For j = 0 To PageCount Step MaxDownLoads
For i = 1 To MaxDownLoads 'OPEN ALL CONNECTIONS
If j + i > UBound(PageNames) Then _Continue
FileName$ = Mid$(PageNames(i + j), _InStrRev(PageNames(i + j), "/") + 1)
url$ = "http://qb64phoenix.com/qb64wiki/" + FileName$
If DownloadMethod = 2 Then url$ = "http://qb64phoenix.com/qb64wiki/index.php?title=" + FileName$ + "&action=edit"
validHandle = OpenDownLoad(url$)
'Print ODL(i).handle,
Next
Cls
Print "Downloading... (Started at: "; t1$; ")"
Print "Curently on:"; j; " of "; PageCount
Do
finished = -1
For i = 1 To MaxDownLoads 'DOWNLOAD ALL FILES
If j + i > UBound(PageNames) Then _Continue
If ODL(i).handle <> 0 Then
t$ = GetDownloads(i)
finished = 0
If t$ <> "" Then
FileName$ = Mid$(PageNames(i + j), _InStrRev(PageNames(i + j), "/") + 1)
PageName2$ = ""
For i1 = 1 To Len(FileName$)
c = Asc(FileName$, i1)
Select Case c
Case 32 ' '(space)
PageName2$ = PageName2$ + "_"
Case 34, 38, 42, 47, 58, 60, 62, 63, 92, 124 '("&*/:<>?\|)
PageName2$ = PageName2$ + "%" + Hex$(c)
Case Else
PageName2$ = PageName2$ + Chr$(c)
End Select
Next
f = FreeFile
Open PageName2$ + ".HTML" For Output As #f
Print #f, t$
Close f
If DownloadMethod = 2 Then
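'Build a 1/0 case mask from the letters so pages whose names differ only by case get distinct files on case-insensitive file systems.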
ext$ = Space$(Len(PageName2$))
For i1 = 1 To Len(PageName2$)
c = Asc(PageName2$, i1)
Select Case c
Case 65 To 90: Asc(ext$, i1) = 49 'upper = 1
Case 97 To 122: Asc(ext$, i1) = 48 'lower = 0
Case Else: Asc(ext$, i1) = c 'non-letter = take as is
End Select
Next
wikiSafeName$ = PageName2$ + "_" + ext$
HTMLtoText PageName2$ + ".HTML", wikiSafeName$ + ".txt"
End If
End If
End If
Next
Loop Until finished
Next
Print "FINISHED!! (Finsihed at: "; Time$; ")" + Chr$(13)
Print Using "##,###.## seconds to download everything on this PC."; Timer - t#
Function CountPages (NumberOfPageLists)
FileLeft$ = "Page List("
FileRight$ = ").txt"
For i = 1 To NumberOfPageLists
file$ = FileLeft$ + _Trim$(Str$(i)) + FileRight$
Open file$ For Binary As #1
t$ = Space$(LOF(1)): Get #1, 1, t$
Close #1
Do
p = InStr(p, t$, "<li><a href=") + 13
If p = 13 Then Exit Do 'we've parsed all the lists from the page. No need to keep going
p2 = InStr(p, t$, Chr$(34))
count = count + 1
PageNames(count) = Mid$(t$, p, p2 - p)
Loop
Next
CountPages = count
ReDim _Preserve PageNames(count) As String
End Function
Function DownloadPageLists
FileLeft$ = "Page List("
FileRight$ = ").txt"
FileCount = 1
CurrentFile$ = ""
url$ = "/qb64wiki/index.php/Special:AllPages" 'the first file that we download
Do
file$ = FileLeft$ + _Trim$(Str$(FileCount)) + FileRight$
Download url$, file$
url2$ = GetNextPage$(file$)
P = InStr(url2$, "from=")
If P = 0 Then Exit Do
If Mid$(url2$, P + 5) > CurrentFile$ Then
CurrentFile$ = Mid$(url2$, P + 5)
FileCount = FileCount + 1
url$ = url2$
Else
Exit Do
End If
Loop
DownloadPageLists = FileCount
_Delay .25 'give the pages time to write to the drive before attempting to read them
End Function
Function CleanHTML$ (OriginalText$)
text$ = OriginalText$ 'don't corrupt incoming text
Type ReplaceList
original As String
replacement As String
End Type
Dim HTML(255) As ReplaceList 'Expandable HTML replacement system
HTML(0).original = "&amp;": HTML(0).replacement = "&"
HTML(1).original = "&lt;": HTML(1).replacement = "<"
HTML(2).original = "&gt;": HTML(2).replacement = ">"
HTML(3).original = "&#124;": HTML(3).replacement = "|"
HTML(4).original = "&pi;": HTML(4).replacement = Chr$(227)
HTML(5).original = "&theta;": HTML(5).replacement = Chr$(233)
HTML(6).original = "&sup1;": HTML(6).replacement = Chr$(252)
HTML(7).original = "&quot;": HTML(7).replacement = Chr$(34)
HTML(8).original = "&sup2;": HTML(8).replacement = Chr$(253)
HTML(9).original = "&nbsp;": HTML(9).replacement = Chr$(255)
HTML(10).original = "Start}}'' ''": HTML(10).replacement = "Start}}"
HTML(11).original = "Start}} '' ''": HTML(11).replacement = "Start}}"
HTML(12).original = "Start}}" + Chr$(10) + "'' ''": HTML(12).replacement = "Start}}"
HTML(13).original = "'' ''" + Chr$(10) + "{{": HTML(13).replacement = Chr$(10) + "{{"
HTML(14).original = "'' '' " + Chr$(10) + "{{": HTML(14).replacement = Chr$(10) + "{{"
HTML(15).original = "'' ''" + MKI$(&H0A0A) + "{{": HTML(15).replacement = Chr$(10) + "{{"
HTML(16).original = "#REDIRECT": HTML(16).replacement = "See page"
For i = 17 To 255
HTML(i).original = "%" + Hex$(i)
HTML(i).replacement = Chr$(i)
Next
For i = 0 To UBound(HTML)
Do
P = InStr(text$, HTML(i).original)
If P = 0 Then Exit Do
text$ = Left$(text$, P - 1) + HTML(i).replacement + Mid$(text$, P + Len(HTML(i).original))
Loop
Next
CleanHTML$ = text$
End Function
Sub Download (url$, outputFile$)
url2$ = CleanHTML(url$)
Shell _Hide "curl -o " + Chr$(34) + outputFile$ + Chr$(34) + " " + Chr$(34) + HomePage$ + url2$ + Chr$(34)
End Sub
Function GetNextPage$ (currentPage$)
SpecialPageDivClass$ = "<div class=" + Chr$(34) + "mw-allpages-nav" + Chr$(34) + ">"
SpecialPageLink$ = "<a href="
SpecialPageEndLink$ = Chr$(34) + " title"
Open currentPage$ For Binary As #1
l = LOF(1)
t$ = Space$(l)
Get #1, 1, t$
Close #1
sp = InStr(t$, SpecialPageDivClass$)
If sp Then
lp = InStr(sp, t$, SpecialPageLink$)
If lp Then
lp = lp + 9
lp2 = InStr(lp, t$, SpecialPageEndLink$)
link$ = Mid$(t$, lp, lp2 - lp)
GetNextPage$ = CleanHTML(link$)
End If
End If
End Function
Sub HTMLtoText (inFile$, outFile$)
Open inFile$ For Binary As #1
t$ = Space$(LOF(1)): Get #1, 1, t$
Close #1
start$ = "<textarea": t$ = Mid$(t$, InStr(t$, start$))
finish$ = "</textarea>": t$ = Left$(t$, InStr(t$, finish$) - 1)
Open outFile$ For Output As #1
If DownloadMethod = 2 Then Print #1, "{{QBDLDATE:" + Date$ + "}}" + Chr$(10) + "{{QBDLTIME:" + Time$ + "}}" + Chr$(10)
Do
a$ = Left$(t$, 8)
If a$ = "<script>" Then
i = InStr(t$, "</script>")
t$ = Mid$(t$, i + 9)
Else
a$ = Left$(t$, 1)
Select Case a$
Case " ", Chr$(10), Chr$(13): t$ = Mid$(t$, 2) 'ignore leading spaces
Case "<": 'look for a leading <
i = InStr(t$, ">")
If i = 0 Then Print #1, CleanHTML(t$): Exit Do
skip$ = Left$(t$, 3)
Select Case skip$
Case "<br", "</p", "</l", "</d": Print #1, ""
End Select
t$ = Mid$(t$, i + 1) 'skip stuff in html formatting brackets
Case Else
i = InStr(t$, "<")
If i Then
Print #1, CleanHTML(Left$(t$, i - 1));
t$ = Mid$(t$, i)
Else
Print #1, CleanHTML(t$)
Exit Do
End If
End Select
End If
Loop
Close #1
End Sub
Function OpenDownLoad (site$)
For i = 1 To MaxDownLoads
If ODL(i).site = site$ Then
OpenDownLoad = ODL(i).handle
Exit Function
End If
Next
For i = 1 To MaxDownLoads
If ODL(i).handle = 0 Then
ODL(i).site = site$
If Left$(UCase$(site$), 5) = "HTTPS" Then Exit Function 'can't open HTTPS pages like this
webpage$ = site$
If Left$(LCase$(webpage$), 7) = "http://" Then webpage$ = Mid$(webpage$, 8) 'trim http://
p = InStr(webpage$, "/")
If p = 0 Then Exit Function
baseURL$ = Left$(webpage$, p - 1)
path$ = Mid$(webpage$, p)
ODL(i).handle = _OpenClient("TCP/IP:80:" + baseURL$)
ODL(i).contents = ""
'base is everything before the first /, path is everything else.
'for example: qb64phoenix.com/qb64wiki/index.php=Main_Page, our base is qb64phoenix.com
' and the path would be /qb64wiki/index.php=Main_Page
Request$ = "GET " + path$ + " HTTP/1.1" + CRLF$ + "Host:" + baseURL$ + CRLF$ + CRLF$
Put #ODL(i).handle, , Request$
Exit Function
End If
Next
OpenDownLoad = 0
End Function
Function GetDownloads$ (PassedI)
i = PassedI
'Print i, ODL(i).handle
If ODL(i).handle <> 0 Then
Get #ODL(i).handle, , t$
ODL(i).contents = ODL(i).contents + t$
If InStr(t$, "</html>") Then
Close ODL(i).handle
ODL(i).handle = 0
GetDownloads$ = ODL(i).contents
End If
End If
End Function
Function Download$ (toSite$)
CRLF$ = Chr$(13) + Chr$(10)
If Left$(UCase$(toSite$), 5) = "HTTPS" Then Exit Function 'can't open HTTPS pages like this
webpage$ = toSite$
If Left$(LCase$(webpage$), 7) = "http://" Then webpage$ = Mid$(webpage$, 8) 'trim http://
p = InStr(webpage$, "/")
If p = 0 Then Exit Function
baseURL$ = Left$(webpage$, p - 1)
path$ = Mid$(webpage$, p)
OpenHandle = _OpenClient("TCP/IP:80:" + baseURL$)
'base is everything before the first /, path is everything else.
'for example: qb64phoenix.com/qb64wiki/index.php=Main_Page, our base is qb64phoenix.com
' and the path would be /qb64wiki/index.php=Main_Page
Request$ = "GET " + path$ + " HTTP/1.1" + CRLF$ + "Host:" + baseURL$ + CRLF$ + CRLF$
Put #OpenHandle, , Request$
Do
Get #OpenHandle, , t$
tempDownload$ = tempDownload$ + t$
_Limit 20
Loop Until InStr(t$, "</html>")
Close OpenHandle
Download$ = tempDownload$
End Function
As you can see from the above, I'm using WinMerge to check that the files match, and the only difference between what I'm producing here externally and what QB64 produces internally is the timestamp. (And I'm producing those pages in about 70 seconds instead of 10 minutes.)
|