==== A shell script to extract all pages from a wiki ====

Fill in the config section at the top of this shell and run it. You must have created a DokuWiki site and have access to the MediaWiki database.

#!/bin/bash
#
# Extract every page from a MediaWiki database and convert it to DokuWiki
# syntax, writing the converted pages straight into the DokuWiki pages tree.

# dokuwiki home
DW_HOME="/home/user/www/dbadocs"

# Mediawiki database connection
WIKIDB="user_dbawiki"
WIKIPASS="xxxxxx"

# where to store the extracted and converted pages
DEST="mediawiki_pages"

# program that does the conversion from MW to DW
PHARSER="./mediawiki2dokuwiki_parser.sh"

# --------------------------------------
# That's it, nothing to change after here
# --------------------------------------

if [[ ! -d "${DW_HOME}/data/pages" ]]; then
    echo "No valid DokuWiki installation found at ${DW_HOME}, please check." >&2
    exit 1
fi

if [[ ! -d "${DEST}" ]]; then
    if ! mkdir -p "${DEST}"; then
        echo "Could not create directory ${DEST}. Are you sure this is where you wanted to store the MediaWiki pages?" >&2
        exit 1
    fi
fi

# BUG FIX: the original only printed this message and carried on, and it
# tested mere existence (-e) although the message talks about executability.
if [[ ! -x "${PHARSER}" ]]; then
    echo "The conversion program ${PHARSER} is not executable (or just not there)" >&2
    exit 1
fi

# fetch a list of all page titles in MediaWiki.
# --skip-column-names: otherwise the literal header "page_title" would be
# processed as if it were a page (bug in the original).
titles=$(mysql --skip-column-names --password="${WIKIPASS}" "${WIKIDB}" \
             -e 'select page_title from page;')

# for each page title found in database (MediaWiki titles store spaces as
# underscores, so plain word-splitting is safe here)
for baretitle in $titles; do
    echo "Processing ${baretitle}..."

    # change any unusable characters
    cleantitle=$(echo "${baretitle}" | sed 's/\//_/g; s/[()!#]//g')
    lowertitle=$(echo "${cleantitle}" | tr '[:upper:]' '[:lower:]')

    # single quotes in the title must be escaped for the SQL literal.
    # BUG FIX: the original "s/'/\'/g" was a no-op ("\'" inside double quotes
    # is just "'"); this version really emits a backslash before the quote.
    sqltitle=$(echo "${baretitle}" | sed "s/'/\\\\'/g")

    # fetch the contents of this page title and write it out to the
    # destination directory with a '.mw' file extension; the sed turns the
    # literal "\n" sequences stored in the dump into real newlines, and
    # grep drops the column-header line.
    mysql --password="${WIKIPASS}" "${WIKIDB}" -e \
        "SELECT old_text FROM revision,page,text WHERE revision.rev_id=page.page_latest AND text.old_id=revision.rev_text_id AND page_title = '${sqltitle}';" \
        | sed 's/\\n/\n/g' | grep -v old_text > "${DEST}/${cleantitle}.mw"

    # process the mediawiki file and return it as a dokuwiki one, putting it
    # in the pages directory with a '.txt' file extension
    "${PHARSER}" "${DEST}/${cleantitle}.mw" "${DW_HOME}/data/pages/${lowertitle}.txt"
done

# MediaWiki's front page becomes DokuWiki's start page
mv "${DW_HOME}/data/pages/main_page.txt" "${DW_HOME}/data/pages/start.txt"

# suspiciously short pages.
# BUG FIX: the original ran "echo rm -f $f", so nothing was ever deleted
# even though the message claimed it was.
for f in "${DEST}"/*; do
    if [[ $(wc -w < "$f") -lt 25 ]]; then
        echo "Deleting $f, too short"
        rm -f "$f"
    fi
done

echo ""
echo "Done. Put the contents of $DEST to Path_Of_dokuwiki/data/pages/"

==== A Perl script to parse the MediaWiki page ====

This is called from the shell script for each extracted page.

#!/bin/sh
# Mediawiki2Dokuwiki Converter
# originally by Johannes Buchner
# License: GPL (http://www.gnu.org/licenses/gpl.txt)
# Modified by Stuart Barkley to:
# - ensure MediaWiki pages with a single quote in the title are treated correctly
# - wrap <pre> and </pre> tags around indented MediaWiki code so that the
#   blocks are processed correctly later.
# - [[url links]] are treated differently from [[page links]]
#
# NOTE(review): the <hN>/<pre> tag literals below were stripped by the wiki
# extraction of the original article and have been reconstructed from context.

sourcefile="$1"
destfile="$2"

# Headings: first mark MediaWiki '=' headings with <hN> tags (h1 first, so
# that deeper heading levels are not half-consumed), then translate the tags
# into DokuWiki fences; MediaWiki's h1 is the biggest level, which maps to
# DokuWiki's '======'.
cat "$sourcefile" | \
    perl -pe 's/^[ ]*=([^=])/<h1>${1}/g' | \
    perl -pe 's/([^=])=[ ]*$/${1} <\/h1>/g' | \
    perl -pe 's/^[ ]*==([^=])/<h2>${1}/g' | \
    perl -pe 's/([^=])==[ ]*$/${1} <\/h2>/g' | \
    perl -pe 's/^[ ]*===([^=])/<h3>${1}/g' | \
    perl -pe 's/([^=])===[ ]*$/${1} <\/h3>/g' | \
    perl -pe 's/^[ ]*====([^=])/<h4>${1}/g' | \
    perl -pe 's/([^=])====[ ]*$/${1} <\/h4>/g' | \
    perl -pe 's/^[ ]*=====([^=])/<h5>${1}/g' | \
    perl -pe 's/([^=])=====[ ]*$/${1} <\/h5>/g' | \
    perl -pe 's/^[ ]*======([^=])/<h6>${1}/g' | \
    perl -pe 's/([^=])======[ ]*$/${1} <\/h6>/g' \
    > mediawiki1

cat mediawiki1 | \
    perl -pe 's/<\/?h1>/======/g' | \
    perl -pe 's/<\/?h2>/=====/g' | \
    perl -pe 's/<\/?h3>/====/g' | \
    perl -pe 's/<\/?h4>/===/g' | \
    perl -pe 's/<\/?h5>/==/g' | \
    perl -pe 's/<\/?h6>/=/g' \
    > mediawiki2
rm -f mediawiki1

# anything that starts with a space that is outside preformatted code blocks
# is MediaWiki code: wrap it in synthetic <pre>...</pre> so the final stage
# turns it into a DokuWiki <code> block.
# (The original compared bareword strings "true"/"false" with 'eq'; plain
# 0/1 flags are equivalent and less fragile.)
cat mediawiki2 | perl -e '
    my $in_pre_block = 0;   # inside an explicit <pre>...</pre> block
    my $pre_printed  = 0;   # we have opened a synthetic <pre> of our own
    while (<>) {
        s/ +$//;                          # strip trailing blanks
        $in_pre_block = 1 if m/\<pre\>/;
        $in_pre_block = 0 if m/\<\/pre\>/;
        if (m/^\ /) {
            if ($in_pre_block) {
                print;
            } else {
                if (!$pre_printed) {
                    print "<pre>\n";
                    $pre_printed = 1;
                }
                print;
            }
        } else {
            if ($pre_printed) {
                print "</pre>\n";
                $pre_printed = 0;
            }
            print;
        }
    }
' > mediawiki3
rm -f mediawiki2

# lists: MediaWiki nests by repeating the marker, DokuWiki by indentation
# (two spaces per level — the exact indents were lost in the extraction and
# are reconstructed per DokuWiki convention).
cat mediawiki3 | \
    perl -pe 's/^[\*#]{4}\*/          * /g' | \
    perl -pe 's/^[\*#]{3}\*/        * /g' | \
    perl -pe 's/^[\*#]{2}\*/      * /g' | \
    perl -pe 's/^[\*#]{1}\*/    * /g' | \
    perl -pe 's/^\*/  * /g' | \
    perl -pe 's/^[\*#]{4}#/          - /g' | \
    perl -pe 's/^[\*\#]{3}\#/        - /g' | \
    perl -pe 's/^[\*\#]{2}\#/      - /g' | \
    perl -pe 's/^[\*\#]{1}\#/    - /g' | \
    perl -pe 's/^\#/  - /g' \
    > mediawiki4
rm -f mediawiki3

# [link] => [[link]]
cat mediawiki4 | \
    perl -pe 's/([^\[])\[([^\[])/${1}[[${2}/g' | \
    perl -pe 's/^\[([^\[])/[[${1}/g' | \
    perl -pe 's/([^\]])\]([^\]])/${1}]]${2}/g' | \
    perl -pe 's/([^\]])\]$/${1}]]/g' \
    > mediawiki5
rm -f mediawiki4

# [[url text]] => [[url|text]]
cat mediawiki5 | \
    perl -pe 's/(\[\[http[^| \]]*) ([^|\]]*\]\])/${1}|${2}/g' \
    > mediawiki6
rm -f mediawiki5

# bold, italic
cat mediawiki6 | perl -pe "s/'''/**/g" | perl -pe "s/''/\/\//g" \
    > mediawiki7
rm -f mediawiki6

# talks: each leading ':' of a MediaWiki reply becomes one '>' quote marker
cat mediawiki7 | \
    perl -pe "s/^[ ]*:/>/g" | \
    perl -pe "s/>:/>>/g" | \
    perl -pe "s/>>:/>>>/g" | \
    perl -pe "s/>>>:/>>>>/g" | \
    perl -pe "s/>>>>:/>>>>>/g" | \
    perl -pe "s/>>>>>:/>>>>>>/g" | \
    perl -pe "s/>>>>>>:/>>>>>>>/g" \
    > mediawiki8
rm -f mediawiki7

# preformatted code blocks: <pre> markers become DokuWiki <code> tags
cat mediawiki8 | \
    perl -pe "s/<pre>/<code>/g" | \
    perl -pe "s/<\/pre>/<\/code>/g" \
    > mediawiki9
rm -f mediawiki8

cat mediawiki9 > "$destfile"
rm -f mediawiki9