Login

**SMcNeill** · 05-25-2022, 12:09 AM

An upgraded version 2.0 of this -- it now has the option to download readable HTML pages, like what you got with the first version of this code, or you can toggle the flag and get basically the same text that is used by the IDE help system and stored in internal/help.

Code: (Select All)
$Console:Only

DefLng A-Z

' Wiki downloader, main program.
' 1) Saves the wiki's "all pages" listing(s) to local files.
' 2) Collects every page link from those listings into PageNames().
' 3) Downloads each page, either as raw HTML or (DownloadMethod = 2) as the
'    wiki source text extracted from the page's edit form.

Const DownloadMethod = 2 '1 = HTML download, 2 = IDE text download

Const HomePage$ = "https://qb64phoenix.com"

ReDim Shared PageNames(10000) As String

NumberOfPageLists = DownloadPageLists 'As of mid 2022, there are only 2 pages listing all the page names.

'NumberOfPageLists = 2 'hard coded for counting without having to download pages repeatedly while testing code

PageCount = CountPages(NumberOfPageLists)

Print PageCount

t# = Timer: t$ = Time$

For i = 1 To PageCount
    Cls
    Print "Downloading... (Started at: "; t$; ")"

    'Everything after the last "/" of the link is the (still URL-encoded) page name.
    FileName$ = Mid$(PageNames(i), _InStrRev(PageNames(i), "/") + 1)

    'Decoded form of the name, used for the local file names.
    FileName2$ = CleanHTML(FileName$)

    'These two names cannot be used as file names, so spell them out.
    Select Case FileName2$
        Case "*": FileName2$ = "Multiply"
        Case "/": FileName2$ = "Divide"
    End Select

    Print i; "of"; PageCount, FileName2$: _Display

    If DownloadMethod = 2 Then 'download as IDE text file
        'The edit form of a page carries the wiki source inside a <textarea>.
        url$ = "/qb64wiki/index.php?title=" + FileName$ + "&action=edit"
        Download url$, FileName2$ + ".HTML"
        HTMLtoText FileName2$ + ".HTML", FileName2$ + ".txt"
        'Kill FileName2$ + ".HTML"  'I'll leave this for now, for comparison and reference.  Remember to unremark it later.
    Else 'download as HTML file for browser viewing
        Download PageNames(i), FileName2$ + ".HTML"
    End If
Next

_AutoDisplay

Print "FINISHED!!  (Finished at: "; Time$; ")" + Chr$(13)

'Timer counts seconds since midnight and restarts at 0, so a run that crosses
'midnight would otherwise report a negative duration.
elapsed# = Timer - t#
If elapsed# < 0 Then elapsed# = elapsed# + 86400

Print Using "##,###.## seconds to download everything on this PC."; elapsed#

Function CountPages (NumberOfPageLists)
    ' Reads the saved listing files "Page List(1).txt" .. "Page List(N).txt" and
    ' stores the href of every <li><a href="..."> entry in the shared PageNames()
    ' array, starting at index 1.
    '
    ' NumberOfPageLists - how many listing files to read
    ' Returns           - the number of links stored; PageNames() is resized to
    '                     exactly that upper bound before returning.

    marker$ = "<li><a href="

    For i = 1 To NumberOfPageLists
        file$ = "Page List(" + _Trim$(Str$(i)) + ").txt"

        f = FreeFile 'do not assume handle #1 is available
        Open file$ For Binary As #f
        t$ = Space$(LOF(f)): Get #f, 1, t$
        Close #f

        p = 1 'every file is scanned from its own beginning

        Do
            hit = InStr(p, t$, marker$)
            If hit = 0 Then Exit Do 'we've parsed all the lists from the page.  No need to keep going

            p = hit + Len(marker$) + 1 '+1 steps over the opening quote of the href value

            p2 = InStr(p, t$, Chr$(34))
            If p2 = 0 Then Exit Do 'href value is never closed (truncated file); nothing usable follows

            count = count + 1

            'Grow the shared array instead of failing with "subscript out of range".
            If count > UBound(PageNames) Then ReDim _Preserve PageNames(count + 1000) As String

            PageNames(count) = Mid$(t$, p, p2 - p)
        Loop
    Next

    CountPages = count

    ReDim _Preserve PageNames(count) As String
End Function

Function DownloadPageLists
    ' Saves the wiki's "all pages" listing to "Page List(1).txt", then keeps
    ' following the navigation link reported by GetNextPage$ for as long as its
    ' "from=" value sorts after the previous one.
    '
    ' Returns the number of listing files written.

    listCount = 1
    lastFrom$ = ""
    nextUrl$ = "/qb64wiki/index.php/Special:AllPages" 'the first file that we download

    Do
        listFile$ = "Page List(" + _Trim$(Str$(listCount)) + ").txt"
        Download nextUrl$, listFile$

        candidate$ = GetNextPage$(listFile$)

        marker = InStr(candidate$, "from=")
        If marker = 0 Then Exit Do 'no usable navigation link on this listing

        'Only move on when the link points further along than where we already are.
        fromKey$ = Mid$(candidate$, marker + 5)
        If fromKey$ <= lastFrom$ Then Exit Do

        lastFrom$ = fromKey$
        listCount = listCount + 1
        nextUrl$ = candidate$
    Loop

    DownloadPageLists = listCount
End Function

Function CleanHTML$ (OriginalText$)
    ' Returns a copy of OriginalText$ with a fixed list of substitutions applied:
    ' a few named HTML entities, some wiki-markup clean-ups, and the percent
    ' escapes %11 .. %FF.  Rules run in table order, and each rule is repeated
    ' (always searching from the start of the text) until it no longer matches.
    ' The caller's string is not modified.

    Type ReplaceList
        original As String
        replacement As String
    End Type

    Dim rules(255) As ReplaceList 'Expandable HTML replacement system

    result$ = OriginalText$ 'don't corrupt incoming text
    lf$ = Chr$(10)

    'Named entities
    rules(0).original = "&amp;"
    rules(0).replacement = "&"
    rules(1).original = "&lt;"
    rules(1).replacement = "<"
    rules(2).original = "&gt;"
    rules(2).replacement = ">"
    rules(3).original = "&verbar;"
    rules(3).replacement = "|"
    rules(4).original = "&pi;"
    rules(4).replacement = Chr$(227)
    rules(5).original = "&theta;"
    rules(5).replacement = Chr$(233)
    rules(6).original = "&sup1;"
    rules(6).replacement = Chr$(252)
    rules(7).original = "&quot;"
    rules(7).replacement = Chr$(34)
    rules(8).original = "&sup2;"
    rules(8).replacement = Chr$(253)
    rules(9).original = "&nbsp;"
    rules(9).replacement = Chr$(255)

    'Wiki markup clean-ups
    rules(10).original = "Start}}'' ''"
    rules(10).replacement = "Start}}"
    rules(11).original = "Start}} '' ''"
    rules(11).replacement = "Start}}"
    rules(12).original = "Start}}" + lf$ + "'' ''"
    rules(12).replacement = "Start}}"
    rules(13).original = "'' ''" + lf$ + "{{"
    rules(13).replacement = lf$ + "{{"
    rules(14).original = "'' '' " + lf$ + "{{"
    rules(14).replacement = lf$ + "{{"
    rules(15).original = "'' ''" + lf$ + lf$ + "{{"
    rules(15).replacement = lf$ + "{{"
    rules(16).original = "#REDIRECT"
    rules(16).replacement = "See page"

    'Remaining slots: "%" followed by the slot number in upper-case hex
    For code = 17 To 255
        rules(code).original = "%" + Hex$(code)
        rules(code).replacement = Chr$(code)
    Next

    For r = 0 To UBound(rules)
        hit = InStr(result$, rules(r).original)
        Do While hit
            result$ = Left$(result$, hit - 1) + rules(r).replacement + Mid$(result$, hit + Len(rules(r).original))
            hit = InStr(result$, rules(r).original)
        Loop
    Next

    CleanHTML$ = result$
End Function

Sub Download (url$, outputFile$)
    ' Fetches HomePage$ + url$ with curl and saves the response as outputFile$.
    '
    ' url$        - site-relative address, typically copied from an href attribute
    ' outputFile$ - name of the local file curl writes
    '
    ' Only the HTML-level "&amp;" escape is undone here.  Percent escapes have to
    ' reach the server untouched: decoding them on this side turns a title such
    ' as "%26B" into a bare "&" (which splits the query string) and "%23" into
    ' "#" (which cuts the URL off at a fragment).
    '
    ' NOTE(review): on shells that expand "$name" inside double quotes, titles
    ' beginning with "$" may still be altered by the shell - confirm per platform.

    quote$ = Chr$(34)
    target$ = url$ 'work on a copy so the caller's string is left alone

    Do
        amp = InStr(target$, "&amp;")
        If amp = 0 Then Exit Do
        target$ = Left$(target$, amp - 1) + "&" + Mid$(target$, amp + 5)
    Loop

    'A literal double quote would end the quoted shell argument early, so send
    'it in its percent-encoded form instead.
    Do
        q = InStr(target$, quote$)
        If q = 0 Then Exit Do
        target$ = Left$(target$, q - 1) + "%22" + Mid$(target$, q + 1)
    Loop

    'Page titles come from the remote site; a quote in the file name would let
    'the rest of the name be read as extra shell text.  Refuse instead.
    If InStr(outputFile$, quote$) Then
        Print "Skipped download, file name contains a quote: "; outputFile$
        Exit Sub
    End If

    Shell _Hide "curl -o " + quote$ + outputFile$ + quote$ + " " + quote$ + HomePage$ + target$ + quote$
End Sub

Function GetNextPage$ (currentPage$)
    ' Looks inside a saved listing file for the navigation block
    ' (<div class="mw-allpages-nav">) and returns the address of the first link
    ' found after it, passed through CleanHTML.
    '
    ' currentPage$ - name of the saved listing file to read
    ' Returns      - the cleaned link, or "" when the navigation block, the link,
    '                or the end of the link cannot be found.

    SpecialPageDivClass$ = "<div class=" + Chr$(34) + "mw-allpages-nav" + Chr$(34) + ">"
    SpecialPageLink$ = "<a href="
    SpecialPageEndLink$ = Chr$(34) + " title"

    'Use a free handle and close only that one; a bare Close would also close
    'any file the caller happens to have open.
    f = FreeFile
    Open currentPage$ For Binary As #f
    t$ = Space$(LOF(f))
    Get #f, 1, t$
    Close #f

    sp = InStr(t$, SpecialPageDivClass$)
    If sp = 0 Then Exit Function

    lp = InStr(sp, t$, SpecialPageLink$)
    If lp = 0 Then Exit Function

    lp = lp + Len(SpecialPageLink$) + 1 '+1 steps over the opening quote of the href value

    lp2 = InStr(lp, t$, SpecialPageEndLink$)
    If lp2 = 0 Then Exit Function 'link is never terminated; report "no next page"

    GetNextPage$ = CleanHTML(Mid$(t$, lp, lp2 - lp))
End Function

Sub HTMLtoText (inFile$, outFile$)
    ' Extracts the part of inFile$ between "<textarea" and "</textarea>" and
    ' writes it to outFile$ as plain text: tags are dropped, <script> sections
    ' are skipped, a few tags produce a line break, and each text run is passed
    ' through CleanHTML before being written.

    Open inFile$ For Binary As #1
    page$ = Space$(LOF(1))
    Get #1, 1, page$
    Close

    'Keep only what lies from the opening textarea tag up to its closing tag.
    page$ = Mid$(page$, InStr(page$, "<textarea"))
    page$ = Left$(page$, InStr(page$, "</textarea>") - 1)

    Open outFile$ For Output As #1

    Do
        If Left$(page$, 8) = "<script>" Then
            'Jump past the whole script section.
            page$ = Mid$(page$, InStr(page$, "</script>") + 9)
        Else
            lead$ = Left$(page$, 1)

            If lead$ = " " Or lead$ = Chr$(10) Or lead$ = Chr$(13) Then
                page$ = Mid$(page$, 2) 'ignore leading spaces
            ElseIf lead$ = "<" Then
                tagEnd = InStr(page$, ">")
                If tagEnd = 0 Then
                    'Unclosed tag: write out what is left and stop.
                    Print #1, CleanHTML(page$)
                    Exit Do
                End If

                'Tags starting with these prefixes end the current output line.
                prefix$ = Left$(page$, 3)
                If prefix$ = "<br" Or prefix$ = "</p" Or prefix$ = "</l" Or prefix$ = "</d" Then Print #1, ""

                page$ = Mid$(page$, tagEnd + 1) 'skip stuff in html formatting brackets
            Else
                nextTag = InStr(page$, "<")
                If nextTag = 0 Then
                    'No more tags: the remainder is the last text run.
                    Print #1, CleanHTML(page$)
                    Exit Do
                End If

                'Write the text up to the next tag and stay on the same line.
                Print #1, CleanHTML(Left$(page$, nextTag - 1));
                page$ = Mid$(page$, nextTag)
            End If
        End If
    Loop

    Close
End Sub

I may have overlooked a few small tweaks so that this will *perfectly* match the IDE format, but I don't think those will be very hard to sort out and adjust for, with the source as it stands now. Smile

Below is basically an image of what an IDE-formatted help page looks like, for those interested:

Login
Username/Email:
Password:	Lost Password?
	Remember me