User Tools

Site Tools


convert_mediawiki_to_dokuwiki

A shell script to extract all pages from a wiki

Fill in the config section at the top of this shell script and run it. You must already have created a DokuWiki site and have access to the MediaWiki database.

#!/bin/bash

# Root directory of the target DokuWiki installation
DW_HOME='/home/user/www/dbadocs'

# MediaWiki database: schema name and the password used to connect to it
WIKIDB='user_dbawiki'
WIKIPASS='xxxxxx'

# Directory that receives the extracted MediaWiki pages
DEST='mediawiki_pages'

# Program run once per page to convert MediaWiki markup to DokuWiki markup
PHARSER='./mediawiki2dokuwiki_parser.sh'

# ----------------------------------------------
# End of configuration - nothing to change below
# ----------------------------------------------

# Sanity checks: stop before touching the database when the target wiki, the
# extraction directory or the converter is unusable.

# Converted pages are written below data/pages, so that directory must exist
if [[ ! -d "${DW_HOME}/data/pages" ]]; then
    echo "No valid DokuWiki installation found at ${DW_HOME}, please check."
    exit 1
fi

# Create the extraction directory on first use
if [[ ! -d "${DEST}" ]] && ! mkdir -p "${DEST}"; then
    echo "Could not create directory ${DEST}. Are you sure this is where you wanted to store the MediaWiki pages?"
    exit 1
fi

# The converter is executed directly, so it has to be a regular file with the
# execute bit set; a mere existence test would let the run fail on every page.
if [[ ! -f "${PHARSER}" || ! -x "${PHARSER}" ]]; then
    echo "The conversion program ${PHARSER} is not executable (or just not there)"
    exit 1
fi

# Fetch the list of all page titles in MediaWiki.
# --skip-column-names suppresses the "page_title" header row, which would
# otherwise be processed as if it were a page; --batch forces one row per line.
if ! titles=$(mysql --batch --skip-column-names --password="${WIKIPASS}" "${WIKIDB}" \
    -e 'select page_title from page;'); then
    echo "Could not read the page titles from database ${WIKIDB}, please check."
    exit 1
fi

# For each page title found in the database (one title per line).
# The list is read line by line instead of being word-split, so titles that
# contain glob characters such as * or ? are not expanded against the current
# directory. It is fed through file descriptor 3 so that commands inside the
# loop cannot consume it from stdin.
while IFS= read -r baretitle <&3; do
    # an empty result set still yields one empty line
    [[ -n "${baretitle}" ]] || continue

    echo "Processing ${baretitle}..."
    # change any unusable characters
    cleantitle=$(printf '%s\n' "${baretitle}" | sed 's/\//_/g' | sed 's/[()!#]//g')
    lowertitle=$(printf '%s\n' "${cleantitle}" | tr "[:upper:]" "[:lower:]")

    # single quotes in the title must be escaped inside the SQL string literal;
    # doubling them is the standard SQL way of doing so
    sqltitle=$(printf '%s\n' "${baretitle}" | sed -e "s/'/''/g")

    # Fetch the contents of this page title and write it out to the destination
    # directory with a '.mw' file extension.
    # --raw writes the text without escaping newlines, tabs and backslashes, and
    # --skip-column-names drops the "old_text" header, so the page text needs no
    # post-processing that could mangle or drop legitimate content.
    mysql --batch --raw --skip-column-names --password="${WIKIPASS}" "${WIKIDB}" -e \
    "SELECT old_text FROM revision,page,text WHERE revision.rev_id=page.page_latest AND text.old_id=revision.rev_text_id and page_title = '${sqltitle}';" \
    > "${DEST}/${cleantitle}.mw"

    # process the mediawiki file and return it as a dokuwiki one putting it in
    # the DokuWiki pages directory with a '.txt' file extension
    "${PHARSER}" "${DEST}/${cleantitle}.mw" "${DW_HOME}/data/pages/${lowertitle}.txt"

done 3<<< "${titles}"

# DokuWiki's start page is called "start"; only rename when a main page exists
if [[ -f "${DW_HOME}/data/pages/main_page.txt" ]]; then
    mv "${DW_HOME}/data/pages/main_page.txt" "${DW_HOME}/data/pages/start.txt"
fi

# Report suspiciously short pages (fewer than 25 words).
# NOTE: this is a dry run - the rm command is only printed, never executed.
# Remove the "echo" in front of rm to really delete the files.
for f in "${DEST}"/*; do
    # skip directories and the literal pattern left over when nothing matches
    [[ -f "${f}" ]] || continue
    if [[ $(wc -w < "${f}") -lt 25 ]]; then
        echo "Deleting $f, too short"
        echo rm -f "$f"
    fi
done

echo ""
# The converted pages were already written into the DokuWiki pages directory;
# ${DEST} only holds the raw MediaWiki extracts.
echo "Done. The raw MediaWiki pages are in ${DEST}, the converted pages are in ${DW_HOME}/data/pages/"

A shell script (using Perl) to convert a MediaWiki page

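This is called from the shell script above for each extracted page. Save it as mediawiki2dokuwiki_parser.sh (the default PHARSER setting) and make it executable.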

#! /bin/sh
# Mediawiki2Dokuwiki Converter
# originally by Johannes Buchner <buchner.johannes [at] gmx.at>
# License: GPL (http://www.gnu.org/licenses/gpl.txt)
# Modified by Stuart Barkley to:
# - ensure MediaWiki pages with a single quote in the title are treated correctly
# - wrap <pre> and </pre> tags around indented MediaWiki code so that the blocks are processed correctly later.
# - [[url links]] are treated differently from [[page links]]

sourcefile="$1"
destfile="$2"

# Headings
cat "$sourcefile" | \
   perl -pe 's/^[ ]*=([^=])/<h1> ${1}/g' | \
   perl -pe 's/([^=])=[ ]*$/${1} <\/h1>/g' | \
   perl -pe 's/^[ ]*==([^=])/<h2> ${1}/g' | \
   perl -pe 's/([^=])==[ ]*$/${1} <\/h2>/g' | \
   perl -pe 's/^[ ]*===([^=])/<h3> ${1}/g' | \
   perl -pe 's/([^=])===[ ]*$/${1} <\/h3>/g' | \
   perl -pe 's/^[ ]*====([^=])/<h4> ${1}/g' | \
   perl -pe 's/([^=])====[ ]*$/${1} <\/h4>/g' | \
   perl -pe 's/^[ ]*=====([^=])/<h5> ${1}/g' | \
   perl -pe 's/([^=])=====[ ]*$/${1} <\/h5>/g' | \
   perl -pe 's/^[ ]*======([^=])/<h6> ${1}/g' | \
   perl -pe 's/([^=])======[ ]*$/${1} <\/h6>/g' \
    > mediawiki1

cat mediawiki1 | \
   perl -pe 's/<\/?h1>/======/g' | \
   perl -pe 's/<\/?h2>/=====/g' | \
   perl -pe 's/<\/?h3>/====/g' | \
   perl -pe 's/<\/?h4>/===/g' | \
   perl -pe 's/<\/?h5>/==/g' | \
   perl -pe 's/<\/?h6>/=/g' | \
   cat > mediawiki2
   rm -f mediawiki1

# anything that starts with a space that is outside preformatted code blocks is MediaWiki code
cat mediawiki2 |
perl -e '
my $in_pre_block = false;
my $pre_printed = false;
while (<>) {
    $_ =~ s/ +$//;
    if (m/\<pre\>/) {
        $in_pre_block = true;
    }
    if (m/\<\/pre\>/) {
        $in_pre_block = false;
    }
    if (m/^\ /) {
        if ($in_pre_block eq true) {
            print;
        } else {
            if ($pre_printed eq false) {
                print "<pre>\n";
                $pre_printed = true;
            }
            print;
        }
    } else {
        if ($pre_printed eq true) {
            print "</pre>\n";
            $pre_printed = false;
        }
        print;
    }
}
'  > mediawiki3
   rm -f mediawiki2

# lists
cat mediawiki3 |
    perl -pe 's/^[\*#]{4}\*/          * /g'  | \
    perl -pe 's/^[\*#]{3}\*/        * /g'    | \
    perl -pe 's/^[\*#]{2}\*/      * /g'      | \
    perl -pe 's/^[\*#]{1}\*/    * /g'        | \
    perl -pe 's/^\*/  * /g'                  | \
    perl -pe 's/^[\*#]{4}#/          \- /g'  | \
    perl -pe 's/^[\*\#]{3}\#/      \- /g'    | \
    perl -pe 's/^[\*\#]{2}\#/    \- /g'      | \
    perl -pe 's/^[\*\#]{1}\#/  \- /g'        | \
    perl -pe 's/^\#/  - /g'                  | \
    cat  > mediawiki4
    rm -f mediawiki3

#[link] => [[link]]
cat mediawiki4 |
   perl -pe 's/([^\[])\[([^\[])/${1}[[${2}/g' |
   perl -pe 's/^\[([^\[])/[[${1}/g' |
   perl -pe 's/([^\]])\]([^\]])/${1}]]${2}/g' |
   perl -pe 's/([^\]])\]$/${1}]]/g' \
   > mediawiki5
   rm -f mediawiki4

#[[url text]] => [[url|text]]
cat mediawiki5 |
   perl -pe 's/(\[\[http[^| \]]*) ([^|\]]*\]\])/${1}|${2}/g' \
   > mediawiki6
   rm -f mediawiki5

# bold, italic
cat mediawiki6 |
   perl -pe "s/'''/**/g" |
   perl -pe "s/''/\/\//g" \
   > mediawiki7
   rm -f mediawiki6

# talks
cat mediawiki7 |
   perl -pe "s/^[ ]*:/>/g" |
   perl -pe "s/>:/>>/g" |
   perl -pe "s/>>:/>>>/g" |
   perl -pe "s/>>>:/>>>>/g" |
   perl -pe "s/>>>>:/>>>>>/g" |
   perl -pe "s/>>>>>:/>>>>>>/g" |
   perl -pe "s/>>>>>>:/>>>>>>>/g" \
   > mediawiki8
   rm -f mediawiki7

# preformatted code blocks
cat mediawiki8|
    perl -pe "s/<pre>/<code>/g" |
    perl -pe "s/<\/pre>/<\/code>/g" \
    > mediawiki9
    rm -f mediawiki8

cat mediawiki9 > "$destfile"
rm -f mediawiki9
convert_mediawiki_to_dokuwiki.txt · Last modified: 2019/01/30 11:32 by 127.0.0.1

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki