Wiki TEXT pages downloader
#5
Stage Two: We now get all the wiki page names from our two files successfully.

Code: (Select All)
$Console:Only
DefLng A-Z
Const HomePage$ = "https://qb64phoenix.com"
ReDim Shared PageNames(10000) As String

'Step 1: fetch the wiki's "all pages" listing files; the function returns how many were saved.
NumberOfPageLists = DownloadPageLists
'NumberOfPageLists = 2  'uncomment (and comment out the line above) to reuse already-downloaded lists while testing

'Step 2: pull every page name out of the saved listing files into PageNames().
PageCount = CountPages(NumberOfPageLists)
Print PageCount

'Step 3: show the collected names one at a time, pausing (Sleep) after each entry.
entry = 1
Do While entry <= PageCount
    Print entry; "of"; PageCount, PageNames(entry)
    Sleep
    entry = entry + 1
Loop


Function CountPages (NumberOfPageLists)
    'Parses the saved listing files "Page List(1).txt" .. "Page List(n).txt" and stores
    'the href target of every <li><a href="..."> entry found between the first "<ul"
    'and the following "</ul" into the shared PageNames() array (1-based).
    '
    'NumberOfPageLists: how many listing files to read.
    'Returns: the total number of names stored.  On exit PageNames() is resized
    '         (contents preserved) so that its upper bound equals that total.
    LinkTag$ = "<li><a href="
    Quote$ = Chr$(34)
    count = 0

    For i = 1 To NumberOfPageLists
        file$ = "Page List(" + _Trim$(Str$(i)) + ").txt"

        'Read the whole file into t$.  FreeFile avoids clashing with a handle already in use.
        handle = FreeFile
        Open file$ For Binary As #handle
        t$ = Space$(LOF(handle))
        Get #handle, 1, t$
        Close #handle

        start = InStr(t$, "<ul") 'skip down to the part of the page with the page listings
        finish = 0
        If start > 0 Then finish = InStr(start, t$, "</ul") 'and we can quit parsing at this point

        'Only parse when a complete list was located; otherwise unrelated links elsewhere
        'on the page could be mistaken for page names.
        If start > 0 And finish > 0 Then
            p = start 'current position in the text we're parsing
            Do
                hit = InStr(p, t$, LinkTag$)
                'Stop when no more entries exist, or when the next one lies beyond the end of the list.
                If hit = 0 Or hit > finish Then Exit Do
                p = hit + Len(LinkTag$) + 1 'step over the tag and the opening quote
                p2 = InStr(p, t$, Quote$) 'closing quote of the href value
                If p2 = 0 Then Exit Do 'truncated markup: nothing usable remains
                count = count + 1
                'Grow the shared array in chunks rather than overflowing its initial size.
                If count > UBound(PageNames) Then ReDim _Preserve PageNames(UBound(PageNames) + 10000) As String
                PageNames(count) = Mid$(t$, p, p2 - p)
            Loop
        End If
    Next

    CountPages = count
    ReDim _Preserve PageNames(count) As String
End Function


Function DownloadPageLists
    'Downloads the wiki's Special:AllPages listing and follows the link returned by
    'GetNextPage$ from one listing page to the next, saving each page as
    '"Page List(<n>).txt" with n starting at 1.
    '
    'The walk ends when the link has no "from=" part, or when the text after "from="
    'does not compare greater than the one followed previously.
    '
    'Returns: the number of listing files saved.
    listIndex = 1
    lastFrom$ = ""
    nextUrl$ = "/qb64wiki/index.php/Special:AllPages" 'the first listing page

    Do
        listFile$ = "Page List(" + _Trim$(Str$(listIndex)) + ").txt"
        Download nextUrl$, listFile$

        candidate$ = GetNextPage$(listFile$)
        fromPos = InStr(candidate$, "from=")
        If fromPos = 0 Then Exit Do 'no continuation link on this page

        fromValue$ = Mid$(candidate$, fromPos + 5)
        If fromValue$ <= lastFrom$ Then Exit Do 'link does not advance past what we already fetched

        lastFrom$ = fromValue$
        listIndex = listIndex + 1
        nextUrl$ = candidate$
    Loop

    DownloadPageLists = listIndex
End Function

Function CleanHTML$ (OriginalText$)
    'Returns a copy of OriginalText$ with each encoded sequence in the replacement
    'table swapped for its plain character ("&amp;" -> "&", "%24" -> "$").
    'The argument itself is left untouched.
    '
    'Every occurrence is decoded exactly once: after a replacement the search resumes
    'just past the inserted text, so "&amp;amp;" becomes "&amp;" (not "&"), and a rule
    'whose replacement contains its own original can never loop forever.
    text$ = OriginalText$ 'don't corrupt incoming text
    Type ReplaceList
        original As String
        replacement As String
    End Type

    'Expandable HTML replacement system: enlarge HTML() and add entries as needed.
    Dim HTML(1) As ReplaceList
    HTML(0).original = "&amp;": HTML(0).replacement = "&"
    HTML(1).original = "%24": HTML(1).replacement = "$"

    For i = 0 To UBound(HTML)
        P = InStr(text$, HTML(i).original)
        Do While P > 0
            text$ = Left$(text$, P - 1) + HTML(i).replacement + Mid$(text$, P + Len(HTML(i).original))
            'Continue after the text just inserted instead of rescanning it.
            P = InStr(P + Len(HTML(i).replacement), text$, HTML(i).original)
        Loop
    Next
    CleanHTML$ = text$
End Function

Sub Download (url$, outputFile$)
    'Saves the page at HomePage$ + url$ into outputFile$ by shelling out to curl.
    'url$ is passed through CleanHTML first; both the output file name and the
    'full address are wrapped in double quotes on the command line.
    'Example address: https://qb64phoenix.com/qb64wiki/index.php?title=Special:AllPages&from=KEY+n
    q$ = Chr$(34)
    address$ = HomePage$ + CleanHTML(url$)
    cmd$ = "curl -o " + q$ + outputFile$ + q$ + " " + q$ + address$ + q$
    Shell cmd$
End Sub

Function GetNextPage$ (currentPage$)
    'Reads the saved listing page currentPage$ and returns the target of the first
    '<a href="..."> link that follows the <div class="mw-allpages-nav"> marker,
    'passed through CleanHTML.
    '
    'Returns "" when the marker, the link, or the end of the link cannot be found.
    SpecialPageDivClass$ = "<div class=" + Chr$(34) + "mw-allpages-nav" + Chr$(34) + ">"
    SpecialPageLink$ = "<a href="
    SpecialPageEndLink$ = Chr$(34) + " title"

    GetNextPage$ = ""

    'Load the whole file into t$.  FreeFile avoids clashing with a handle already in use,
    'and only that handle is closed so files opened elsewhere stay open.
    handle = FreeFile
    Open currentPage$ For Binary As #handle
    l = LOF(handle)
    t$ = Space$(l)
    Get #handle, 1, t$
    Close #handle

    sp = InStr(t$, SpecialPageDivClass$)
    If sp = 0 Then Exit Function 'no navigation block on this page

    lp = InStr(sp, t$, SpecialPageLink$)
    If lp = 0 Then Exit Function 'navigation block contains no link

    lp = lp + Len(SpecialPageLink$) + 1 'step over <a href= and the opening quote
    lp2 = InStr(lp, t$, SpecialPageEndLink$)
    If lp2 = 0 Then Exit Function 'end of link not found (truncated page)

    link$ = Mid$(t$, lp, lp2 - lp)
    GetNextPage$ = CleanHTML(link$)
End Function
Reply


Messages In This Thread
Wiki TEXT pages downloader - by SMcNeill - 05-23-2022, 04:15 AM
RE: Wiki TEXT pages downloader - by RhoSigma - 05-23-2022, 08:54 AM
RE: Wiki TEXT pages downloader - by Coolman - 05-23-2022, 09:37 AM
RE: Wiki TEXT pages downloader - by SMcNeill - 05-23-2022, 08:45 PM
RE: Wiki TEXT pages downloader - by SMcNeill - 05-24-2022, 09:50 AM



Users browsing this thread: 4 Guest(s)