JargonWiki:Jargon2Wiki.php

From JargonWiki

Jump to: navigation, search


GNU General Public License

This article is subject to the GNU General Public License version 3.0.
No user contributions are accepted to pages licensed under the GNU GPL.
For more information see the standard editing privileges.

Image:GNU_sm.png


Note: SVN access, most recent versions, documentation, version history, and more are available at the Google Code site for this project:

<?php
# @title PHP Jargon File to MediaWiki Converter
# @version 0.2
# @author Charles File
# @link http://jargonwiki.com/
#
# This script is designed to accept input in the format used by the plaintext version of
# The Jargon File and output to either screen or local disk that same content in a format:
# a) Readable and Useful to MediaWiki
# b) Usable by BulkPageCreator.php, which actually submits the articles to the wiki.
#
# If you're interested, and/or want to try out this script, you can get your hands on a
# copy of a text version of the Jargon File at:
# http://catb.org/jargon/oldversions/
#
# As of this writing, the latest version is:
# http://catb.org/jargon/oldversions/jarg447.txt
#
#  Syntax:
#  
#  Invoked via a Command Line Interface (CLI):
#  > php Jargon2Wiki.php input [mode] [outputfile] [debug flag]
# for instance:
#  > php Jargon2Wiki.php inputfile.txt file outputfile.txt 1
#  
#  - or -
#  
#  Invoked via a Uniform Resource Identifier (URI):
#  http://../Jargon2Wiki.php?file=inputfile.txt(&mode=file&out=outputfile.txt&debug=1)
#
# If not passed in the argument, mode defaults to screen, output file defaults to output.txt,
# and debug defaults to off.
#  
# Note: This parser is meant to be run in a controlled, local environment,
# to prepare and wiki-ify content before it is submitted to a MediaWiki.
# Deployment to open access is completely untested and unsupported.
# If you are looking to dynamically parse and submit content of unknown
# trustworthiness to your Wiki, I suggest you use something else.
#
# Known issues:
# This whole thing could probably be a bit more efficient by combining many of the string replacement
# functions. But it won't change much big-O wise, and since it's only a parser meant to be run rarely,
# I'm not very concerned.
# Lots of general clean-up necessary.
# Using globals is hack-y and potentially a security risk.
# Hell, this thing is full of potential security risks.
# Error handling and reporting is rudimentary, at best.
#
# Resolved Issues as of v0.2:
# See All: Still not parsing correctly.
# Some articles are of the form :word: 1. n. Noun definition. 2. v. Verb definition. Grrr....
# Ordered lists behave badly.
# Need to parse and wiki-ify image links.
#


# Some variables that might need changing:
# Line terminator. It is used to locate article and paragraph boundaries in the input and is
# also written between lines of the generated output.
# NOTE(review): input saved with bare "\n" line endings will presumably not be split
# correctly with this value -- confirm against the copy of the Jargon File you are using.
$strCRLF = "\r\n";
# Version string inserted into the {{Jargon File}} and {{Source}} templates of every article.
$strJargonFileVersion = "4.4.7";
# When truthy, an HTML debug dump is prepended to every parsed article. May be overridden
# below by the "debug" GET parameter or the fourth CLI argument.
$boolDebug = 0; //1 to enable debug mode.

# Expands a Jargon File grammar abbreviation into the word it stands for.
# @param string strParamAbbreviation An abbreviation including its trailing period (e.g. "n.").
# @return string The expansion (e.g. "Noun"), or an empty string if the abbreviation is unknown.
function getAbbreviationValue($strParamAbbreviation) {
	# Known abbreviations, listed in the order in which they are tested.
	$arrExpansions = array(
		"abbrev." => "Abbreviation",
		"adj." => "Adjective",
		"adv." => "Adverb",
		"alt." => "Alternate",
		"cav." => "Caveat",
		"conj." => "Conjunction",
		"esp." => "Especially",
		"excl." => "Exclamation",
		"imp." => "Imperative",
		"interj." => "Interjection",
		"n." => "Noun",
		"obs." => "Obsolete",
		"pl." => "Plural",
		"poss." => "Possibly",
		"pref." => "Prefix",
		"prob." => "Probably",
		"prov." => "Proverbial",
		"quant." => "Quantifier",
		"suff." => "Suffix",
		"v." => "Verb",
		"var." => "Variant",
		"vi." => "Intransitive Verb",
		"vt." => "Transitive Verb",
	);
	# The comparison is intentionally loose (==): the first entry that compares equal wins.
	foreach ($arrExpansions as $strKnownAbbreviation => $strExpansion) {
		if ($strParamAbbreviation == $strKnownAbbreviation) {
			return $strExpansion;
		} //if
	} //foreach
	return "";
} //function

# Guesses the origin ("etymology") of an article from its derivation text.
#
# Each known origin is looked for as a case-insensitive substring/pattern; the first one
# found, in the order listed below, is returned. Matching is not anchored to word
# boundaries, so e.g. "irc" is also found inside longer words.
#
# This uses preg_match() with the "i" modifier rather than eregi(): the POSIX ereg family
# was deprecated in PHP 5.3 and removed in PHP 7.0.
#
# @param string strParamDerivation A string containing information on an article's origins.
# @return string The detected origin, or an empty string when none is recognised.
function getEtymology($strParamDerivation) {
	# Pattern => origin label. Order matters: earlier entries take precedence.
	$arrOrigins = array(
		"/unix/i" => "Unix",
		"/irc/i" => "IRC",
		"/(usenet|newsgroup)/i" => "Usenet",
		"/(email|@)/i" => "Email",
		"/(http|www|web)/i" => "World Wide Web",
		"/ascii/i" => "ASCII",
		"/ansi/i" => "ANSI",
		"/greek/i" => "Greek",
		"/french/i" => "French",
		"/german/i" => "German",
		"/spanish/i" => "Spanish",
		"/latin/i" => "Latin",
		"/japanese/i" => "Japanese",
		"/chinese/i" => "Chinese",
		"/indian/i" => "Indian",
		"/ftp/i" => "FTP",
	);
	$strHaystack = (string) $strParamDerivation;
	foreach ($arrOrigins as $strPattern => $strOrigin) {
		if (preg_match($strPattern, $strHaystack)) {
			return $strOrigin;
		} //if
	} //foreach
	return '';
} //function

# Renders an array as a single line of "[key] => value " pairs (used for debug output).
# @param array arrParam A one-dimensional array.
# @return string The pairs in iteration order, each followed by a space; an empty string
#                when the argument is not an array or has no elements.
function describeArray($arrParam) {
	if (!is_array($arrParam)) {
		return '';
	} //if
	$arrPairs = array();
	foreach (array_keys($arrParam) as $mixKey) {
		$arrPairs[] = "[" . $mixKey . "] => " . $arrParam[$mixKey] . " ";
	} //foreach
	return implode('', $arrPairs);
} //function

# Converts a single Jargon File article into MediaWiki markup, wrapped in the
# "--ENDTITLE--" / "--ENDPAGE--" framing used by BulkPageCreator.php.
#
# Reads three globals: $strCRLF (line terminator), $strJargonFileVersion (inserted into the
# generated templates) and $boolDebug. When $boolDebug is truthy an HTML dump of the
# extracted fields is prepended to the result; that dump leaves two <div> elements open,
# which the caller is expected to close after printing the article.
#
# Corrections made in this revision:
# - Leading line breaks are now really stripped (a one-character substring was being
#   compared with the two-character $strCRLF, which could never match).
# - A missing ":" or missing line break in the article no longer feeds FALSE into substr().
# - Abbreviations found by scanning the body are stored without the surrounding
#   whitespace/comma, so getAbbreviationValue() can expand them, and are de-duplicated.
# - The derivation pattern rejects "[alt. ...]" text (the negative lookahead used to sit one
#   character too late) and no longer captures the character following the "]".
# - Only the leading "alt."/"alternate" marker is removed from the Alternate text.
# - An ordered list that starts in the middle of the body keeps its line break.
# - The debug dump shows the whole info array instead of passing a string to describeArray().
#
# @param string strParamArticle An article in Jargon File format: "title: info" on the first
#                               line, followed by the body.
# @return string The argument string in MediaWiki format.
function parseArticle($strParamArticle) {
	# Get some globals we'll need.
	global $strCRLF;
	global $strJargonFileVersion;
	global $boolDebug;

	# $strCRLF is spliced into regular expressions below; quote it so it always matches literally.
	$strQuotedCRLF = preg_quote($strCRLF, "/");

	# Get rid of any line-break characters or leading ":"'s at the start of the article,
	# so that every article starts directly with its title.
	$strParamArticle = ltrim($strParamArticle, $strCRLF . ":");

	# Split the article into its first line (the header) and everything after it (the body).
	# The body deliberately still starts with the line break that ended the header.
	$intIndexEndFirstLine = strpos($strParamArticle, $strCRLF);
	if ($intIndexEndFirstLine === FALSE) {
		$strHeader = $strParamArticle;
		$strBody = "";
	} else {
		$strHeader = substr($strParamArticle, 0, $intIndexEndFirstLine);
		$strBody = substr($strParamArticle, $intIndexEndFirstLine);
	} //if..else

	# Title is the header up to its first ":"; Info is whatever follows on the same line.
	$intIndexEndTitle = strpos($strHeader, ":");
	if ($intIndexEndTitle === FALSE) {
		$strTitle = $strHeader;
		$strInfo = "";
	} else {
		$strTitle = substr($strHeader, 0, $intIndexEndTitle);
		$strInfo = substr($strHeader, $intIndexEndTitle + 1);
	} //if..else
	$strInfo = preg_replace("/\s+/", ' ', trim($strInfo));

	# Split Info on " " in order to separate the words, then sort the words into
	# pronunciations (contain "/") and abbreviations (contain "."). Other words are ignored.
	$arrInfo = explode(" ", $strInfo);
	$arrPronunciation = array();
	$arrAbbreviation = array();
	foreach ($arrInfo as $strValueInfo) {
		$strValueInfo = trim(str_replace(',', '', $strValueInfo));
		if (strpos($strValueInfo, "/") !== FALSE) {
			$arrPronunciation[] = $strValueInfo;
		} else if (strpos($strValueInfo, ".") !== FALSE) {
			$arrAbbreviation[] = $strValueInfo;
		} //if..else
	} //foreach
	# An empty pronunciation marker carries no information.
	if (count($arrPronunciation) > 0 && $arrPronunciation[0] == "//") {
		$arrPronunciation[0] = "";
	} //if

	# Some articles are of the form :word: 1. n. Noun definition. 2. v. Verb definition.
	# If the header yielded no abbreviations, scan the body for them instead. The trailing
	# separator is only looked at (not consumed) so that adjacent abbreviations such as
	# " n. v. " are both found, and only the abbreviation itself is stored.
	if (count($arrAbbreviation) == 0) {
		$arrAbbreviationCatcher = array();
		preg_match_all('/\s(abbrev|adj|adv|alt|cav|conj|esp|excl|imp|interj|n|obs|pl|poss|pref|prob|prov|quant|suff|v|var|vi|vt)\.(?=[\s,])/', $strBody, $arrAbbreviationCatcher, PREG_PATTERN_ORDER);
		foreach (array_unique($arrAbbreviationCatcher[1]) as $strValueAbbreviationCatcher) {
			$arrAbbreviation[] = $strValueAbbreviationCatcher . ".";
		} //foreach
	} //if

	# Find and wiki-ify image references.
	$strBody = preg_replace("/\[\s*(\S+\.((png)|(gif)|(jpg)|(jpeg)))\s*\]/", "[[Image:$1]]", $strBody);

	# Wiki format body text.
	# Remove the two line breaks (and indentation) at the start of the article body.
	$strBody = preg_replace("/^" . $strQuotedCRLF . $strQuotedCRLF . "\s+/", "", $strBody);
	# Remove line breaks between lines within paragraphs,
	# but retain two line breaks between paragraphs.
	$strBody = str_replace($strCRLF . $strCRLF, "--PARAGRAPH BREAK--", $strBody);
	$strBody = preg_replace("/\s+/", " ", $strBody);
	$strBody = str_replace("--PARAGRAPH BREAK--", $strCRLF . $strCRLF, $strBody);
	# Remove white space at the start of paragraphs.
	$strBody = str_replace($strCRLF . " ", $strCRLF, $strBody);

	# Find See Also: entries in the page.
	# First, find phrases of the form:
	# See|Compare|Oppose|Also (also [optional])(: [optional]) {word} (... {word})* followed by "." or ")".
	preg_match_all("/(((S|s)ee)|((C|c)ompare)|((O|o)ppose)|((A|a)lso))(\s+(A|a)lso)?:?\s+\{[^\}]+\}([^\.\)]*\{[^\}]+\})*(\.|\))/", $strBody, $arrMatchPhraseSeeAlso, PREG_SET_ORDER);
	$arrSeeAlso = array();
	# Now step through these phrases and pull out just the "{...}" references, rewritten as
	# wiki links. These are the article names listed in the "Also" field of the template.
	foreach ($arrMatchPhraseSeeAlso as $arrValuePhraseSeeAlso) {
		preg_match_all("/\{[^\}]+\}/", $arrValuePhraseSeeAlso[0], $arrMatchSeeAlso, PREG_PATTERN_ORDER);
		foreach ($arrMatchSeeAlso[0] as $strValueMatchSeeAlso) {
			$arrSeeAlso[] = str_replace(array("{", "}"), array("[[", "]]"), $strValueMatchSeeAlso);
		} //foreach
	} //foreach
	# Dedupe!
	$arrSeeAlso = array_unique($arrSeeAlso);

	# Find Derivation information in the page: the first single-bracketed "[...]" text in the
	# body. Double brackets are skipped because image references were rewritten to
	# [[Image:...]] above. Text starting with "alt."/"alternate"/"alternative" is skipped as
	# well; that is treated as Alternate text (see below).
	# Note: parenthesised text is deliberately not considered a derivation.
	$strDerivation = '';
	$arrMatchDerivation = array();
	if (preg_match('/(?<!\[)\[(?![Aa]lt(?:\.|ernat(?:iv)?e)?:?\s)([^\[\]]+)\](?!\])/', $strBody, $arrMatchDerivation)) {
		$strDerivation = ucfirst(trim($arrMatchDerivation[1]));
		# Add a full stop to the end (if necessary).
		if ($strDerivation != "" && substr($strDerivation, -1) != ".") {
			$strDerivation .= ".";
		} //if
	} //if

	# Get Etymology info from Derivation info.
	$strEtymology = getEtymology($strDerivation);

	# Find Alternate term or spelling information in the page: text in parentheses or brackets
	# that starts with "alt", "alt.", "alternate" or "alternative" followed by white space.
	# The white space is required so that Usenet group names (e.g. alt.arts.poetry.comments)
	# are not matched. Only the text after that marker is kept.
	$strAlternate = '';
	$arrMatchAlternate = array();
	if (preg_match('/[(\[][Aa]lt(?:\.|ernat(?:e|ive))?:?\s([^)\]]+)[)\]]/', $strBody, $arrMatchAlternate)) {
		$strAlternate = str_replace(array("(", "[", $strCRLF), "", $arrMatchAlternate[1]);
	} //if

	# Wiki format ordered lists.
	# A "1." at the very start of the body opens a list; any number that follows a line break
	# becomes a list item on its own line. Numbers elsewhere (e.g. at the end of a sentence)
	# are left alone.
	$strBody = preg_replace(
		array("/^\s*1\.\s+/", "/" . $strQuotedCRLF . "\s*[0-9]+\.\s+/"),
		array("# ", $strCRLF . "# "),
		$strBody
	);
	# Wiki format links
	$strBody = str_replace(array("{", "}"), array("[[", "]]"), $strBody);

	# Create link to the Jargon File [This isn't quite foolproof].
	# If the article title starts with a letter, that letter (upper-cased) is the directory;
	# otherwise the directory is "0".
	$strTitleFirstChar = substr($strTitle, 0, 1);
	if (preg_match("/[a-zA-Z]/", $strTitleFirstChar)) {
		$strJargonFileDir = strtoupper($strTitleFirstChar);
	} else {
		$strJargonFileDir = "0";
	} //if..else
	# Replace spaces and slashes in the article title with hyphens and "@" with "at",
	# and use that as the working Jargon File article file name.
	$strJargonFileLinkTitle = str_replace(array(" ", "/", "\\", "@"), array("-", "-", "-", "at"), $strTitle);
	# If there's a bang at the end, make it a hyphen.
	$strJargonFileLinkTitle = preg_replace("/\!$/", "-", $strJargonFileLinkTitle);
	# Now remove non-URL characters.
	$strJargonFileLinkTitle = preg_replace("/[^a-zA-Z0-9\?\&\_\-]/", "", $strJargonFileLinkTitle);
	# Cut off non alpha-numeric characters from the front of the filename.
	$strJargonFileLinkTitle = preg_replace("/^[^a-zA-Z0-9]+/", "", $strJargonFileLinkTitle);
	$strJargonFileLink = "http://catb.org/jargon/html/" . $strJargonFileDir . "/" . $strJargonFileLinkTitle . ".html";

	# Wiki links for the usage abbreviations; used both in the template and in the article text.
	$arrUsageLinks = array();
	foreach ($arrAbbreviation as $strAbbrevValue) {
		$arrUsageLinks[] = "[[Grammar:" . getAbbreviationValue($strAbbrevValue) . "|" . $strAbbrevValue . "]]";
	} //foreach

	# Create Wiki article in the format usable by the batch upload script "BulkPageCreator.php".
	$strArticle = str_replace(":", "", $strTitle);
	$strArticle .= $strCRLF;
	# Add formatting information for "BulkPageCreator.php".
	$strArticle .= "--ENDTITLE--";
	$strArticle .= $strCRLF;
	# Add the "This comes from The Jargon File" template.
	$strArticle .= "{{Jargon File|" . $strJargonFileVersion . "}}";
	$strArticle .= $strCRLF;
	# Add the "Jargon Information Summary" template.
	$strArticle .= "{{Jargon|Term=" . $strTitle . "|";
	# Add pronunciation information to the template call in the format:
	# Pronunciation=/foo/|Pronunciation-2=/fu/|Pronunciation-3=/fuh/
	$intCount = 1;
	foreach ($arrPronunciation as $strPronounceValue) {
		$strArticle .= "Pronunciation";
		if ($intCount > 1) {
			$strArticle .= "-" . $intCount;
		} //if
		$strArticle .= "=" . $strPronounceValue . "|";
		$intCount++;
	} //foreach
	# Add usage information to the template call in the format:
	# Usage=[[Grammar:Noun|n.]] [[Grammar:Obsolete|obs.]] ... [[Grammar:Adjective|adj.]]
	$strArticle .= "Usage=" . implode(" ", $arrUsageLinks) . "|";
	# Add etymology information to the template call in the format:
	# Etymology=[[Etymology:Unix|Unix]]
	$strArticle .= "Etymology=";
	if ($strEtymology != "") {
		$strArticle .= "[[Etymology:" . $strEtymology . "|" . $strEtymology . "]]";
	} //if
	$strArticle .= "|";
	# Add derivation information to the template call in the format:
	# Derivation=This comes from Unix
	$strArticle .= "Derivation=" . str_replace(array("{", "}"), array("[[", "]]"), $strDerivation) . "|";
	# Add alternate term/spelling information to the template call in the format (for example, could be anything):
	# Alternate=Foobar /`fu bar/
	$strArticle .= "Alternate=" . str_replace(array("{", "}"), array("[[", "]]"), $strAlternate) . "|";
	# Add see also information to the template call in the format:
	# Also=[[Foo]], [[Bar]], ..., [[Baz]]
	$strArticle .= "Also=" . implode(", ", $arrSeeAlso);
	$strArticle .= "}}";
	$strArticle .= $strCRLF;
	$strArticle .= $strCRLF;
	# Add article text.
	# First add title in bold.
	$strArticle .= "'''" . $strTitle . "''':";
	# Next add pronunciations.
	foreach ($arrPronunciation as $strPronounceValue) {
		$strArticle .= " " . $strPronounceValue;
	} //foreach
	# Finally, add abbreviations.
	foreach ($arrUsageLinks as $strUsageLink) {
		$strArticle .= " " . $strUsageLink;
	} //foreach
	$strArticle .= $strCRLF;
	$strArticle .= $strCRLF;
	# Add article body.
	$strArticle .= $strBody;
	$strArticle .= $strCRLF;
	$strArticle .= $strCRLF;
	# Add "Sources" section.
	$strArticle .= "== Sources ==";
	$strArticle .= $strCRLF;
	# Add "Source" template with a link to the JargonWiki article on The Jargon File.
	$strArticle .= "{{Source|Source:Jargon_File|The Jargon File|version " . $strJargonFileVersion . "}}";
	$strArticle .= $strCRLF;
	$strArticle .= $strCRLF;
	# Add "External links" section.
	$strArticle .= "== External links ==";
	$strArticle .= $strCRLF;
	# Add link to the page containing the same article on The Jargon File Website.
	$strArticle .= "{{External Source|" . $strJargonFileLink . "|" . $strTitle . "| in [[Source:Jargon_File|The Jargon File]].}}";
	$strArticle .= $strCRLF;
	$strArticle .= $strCRLF;
	# Add "This article is in the public domain" template.
	$strArticle .= "{{Public Domain}}";
	$strArticle .= $strCRLF;
	$strArticle .= $strCRLF;
	# Add relevant category.
	$strArticle .= "[[Category:Hacker|{{PAGENAME}}]]";
	$strArticle .= $strCRLF;
	$strArticle .= $strCRLF;
	# Add formatting information for "BulkPageCreator.php".
	$strArticle .= "--ENDPAGE--";
	$strArticle .= $strCRLF;
	# This is the end of the construction of a wiki-formatted article for output to a text file
	# and eventual upload to the MediaWiki-powered host.

	# Below: HTML output used to debug during development.
	# The outer <div> and the "Wiki Format" <div> are intentionally left open here.
	if ($boolDebug) {
		$strDebug = "<div style=\"border:1px solid black; margin:.8em; padding:.8em;\">";
		$strDebug .= "<div style=\"border:1px dashed black; margin:.8em; padding:.8em;\">";
		$strDebug .= "<h3>HTML Format</h3>";
		$strDebug .= "<p></p>" . $strCRLF;
		$strDebug .= "<p></p>" . $strCRLF;
		$strDebug .= "<p>";
		$strDebug .= "title: " . str_replace(":", "", $strTitle) . "\n";
		$strDebug .= "</p><p>" . $strCRLF;
		$strDebug .= "etymology: ";
		$strDebug .= $strEtymology;
		$strDebug .= "</p><p>" . $strCRLF;
		$strDebug .= "derivation: ";
		$strDebug .= $strDerivation;
		$strDebug .= "</p><p>" . $strCRLF;
		$strDebug .= "pronunciations: ";
		$strDebug .= describeArray($arrPronunciation);
		$strDebug .= "</p><p>" . $strCRLF;
		$strDebug .= "abbreviations: ";
		$strDebug .= describeArray($arrAbbreviation);
		$strDebug .= "</p><p>" . $strCRLF;
		$strDebug .= "alternates: ";
		$strDebug .= $strAlternate;
		$strDebug .= "</p><p>" . $strCRLF;
		$strDebug .= "info: ";
		$strDebug .= describeArray($arrInfo);
		$strDebug .= "</p><p>" . $strCRLF;
		$strDebug .= "see also: ";
		$strDebug .= describeArray($arrSeeAlso);
		$strDebug .= "</p><p>" . $strCRLF;
		$strDebug .= "body: " . $strBody . $strCRLF;
		$strDebug .= "</p>";
		$strDebug .= "</div>";

		$strDebug .= "<div style=\"border:1px dashed black; margin:.8em; padding:.8em;\">";
		$strDebug .= "<h3>Wiki Format</h3>";

		$strArticle = $strDebug . $strArticle;
	} //if
	return $strArticle;
} //function

# Appends a string to a file on disc, creating the file if it does not exist yet.
#
# If the file cannot be opened a message is echoed and false is returned; the handle is
# only closed when it was actually opened (calling fclose() on a failed fopen() result is
# an error on current PHP versions).
#
# @param string strParam ASCII data to be appended to a file on disc.
# @param string strParamOutputFileName The name of the file to which the data will be appended. If file does not exist, it will be created.
# @return boolean True when the whole string was written, false otherwise.
function fappend($strParam, $strParamOutputFileName) {
	$fdFileOut = fopen($strParamOutputFileName, "a");
	if (!$fdFileOut) {
		echo "Output file failed to open. Bad or duplicate file name.";
		return false;
	} //if
	# fwrite() returns the number of bytes written, or false on failure.
	$intBytesWritten = fwrite($fdFileOut, $strParam);
	fclose($fdFileOut);
	return ($intBytesWritten !== false && $intBytesWritten == strlen($strParam));
} //function

# Collect the run-time arguments: input file, output mode, output file and debug flag.
#
# The script can be invoked through a web server (arguments in the query string) or from
# the command line (arguments in $argv). The only test used here to tell the two apart is
# whether $_GET contains anything. This is far from foolproof: an HTTP request without any
# query parameters falls through to the "CLI" branch. In that case $argv may not be defined
# at all, so it is checked before it is read.
#
# NOTE(review): the "file" and "out" values are used as file system paths without any
# validation. As stated in the header, this script is only meant for a controlled, local
# environment.
if (count($_GET) > 0) {
# Via URI:
	if (array_key_exists("file", $_GET)) { $strInputFileName = $_GET["file"]; }
	if (array_key_exists("mode", $_GET)) { $strOutputMode = $_GET["mode"]; }
	if (array_key_exists("out", $_GET)) { $strOutputFileName = $_GET["out"]; }
	if (array_key_exists("debug", $_GET)) { $boolDebug = $_GET["debug"]; }
} else {
# Via CLI:
	$arrCliArguments = (isset($argv) && is_array($argv)) ? $argv : array();
	if (array_key_exists(1, $arrCliArguments)) { $strInputFileName = $arrCliArguments[1]; }
	if (array_key_exists(2, $arrCliArguments)) { $strOutputMode = $arrCliArguments[2]; }
	if (array_key_exists(3, $arrCliArguments)) { $strOutputFileName = $arrCliArguments[3]; }
	if (array_key_exists(4, $arrCliArguments)) { $boolDebug = $arrCliArguments[4]; }
} //if..else

# Mode defaults to screen when it was not passed (see the syntax notes in the file header).
if (!isset($strOutputMode) || $strOutputMode == "") {
	$strOutputMode = "screen";
} //if


# End initialization. Begin processing, provided we have something to process.
#
# The input file is read in chunks. Every article starts with ":" after an empty line, so
# the text is cut on (line break, line break, ":"). The last piece of each cut may be an
# incomplete article; it is carried over and placed in front of the next chunk BEFORE
# cutting again, so that a delimiter that happens to straddle two chunks is still found.
# When the input is exhausted, whatever is left over is parsed as the final article.
if (isset($strInputFileName) && file_exists($strInputFileName)) {
	$strArticleBlockRemainder = '';
	$intArticleCount = 0;
	# Fall back to the documented defaults for anything that was not passed.
	if (!isset($strOutputMode)) {
		$strOutputMode = "screen";
	} //if
	if (!isset($strOutputFileName) || $strOutputFileName == "") {
		$strOutputFileName = "output.txt";
	} //if
	$strArticleDelimiter = $strCRLF . $strCRLF . ":";

	# Nothing is read (and nothing is output) for an empty input file.
	if (filesize($strInputFileName)) {
		$fdArticles = fopen($strInputFileName, "r");
		if ($fdArticles) {
			$boolMoreInput = true;
			while ($boolMoreInput) {
				# Read the next chunk; treat a read failure like the end of the file so
				# the loop always terminates.
				$strArticles = feof($fdArticles) ? false : fread($fdArticles, 8192);
				if ($strArticles === false) {
					# No more input: the leftovers are the final article.
					$arrArticleBlock = array($strArticleBlockRemainder);
					$boolMoreInput = false;
				} else {
					# Cut leftovers + new chunk into articles and keep the last,
					# possibly incomplete, piece for the next round.
					$arrArticleBlock = explode($strArticleDelimiter, $strArticleBlockRemainder . $strArticles);
					$strArticleBlockRemainder = array_pop($arrArticleBlock);
				} //if..else

				# Build and output the wiki-formatted entry for each complete article.
				foreach ($arrArticleBlock as $strValueArticleBlock) {
					$intArticleCount++;
					$strArticle = parseArticle($strValueArticleBlock);
					if ($strOutputMode == "file") {
						if (fappend($strArticle, $strOutputFileName)) {
							echo "Article " . $intArticleCount . " successfully written to " . $strOutputFileName . "." . $strCRLF;
						} else {
							echo "Article " . $intArticleCount . " could not be written to " . $strOutputFileName . "." . $strCRLF;
						} //if..else
					} else {
						echo $strArticle;
					} //if..else

					# Close the two <div> elements opened by parseArticle()'s debug dump.
					if ($boolDebug) {
						echo "</div>";
						echo "</div>";
					} //if
				} //foreach
			} //while
			fclose($fdArticles);
		} else {
			echo "Input file failed to open.";
		} //if..else
	} //if
} else {
	echo "Please pass a valid input filename in the arguments.";
} //if..else


exit;
?>

External links

Personal tools
Toolbox